In [8]:
# importing different libraries
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.svm import SVC, LinearSVC
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = stopwords.words('english')

## In this notebook, we are experimenting with differnt SVMs with differnt vectors.

In [4]:
train_df = pd.read_csv('../input/midas-task/reddit_data.csv')
train_df.dropna(inplace=True)
le = preprocessing.LabelEncoder()
le.fit(train_df["flair"])
train_df["label"] = le.transform(train_df["flair"])

In [5]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text, train_df.label, 
                                                  stratify=train_df.label, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [6]:
# function for tokenization, we are also using stemming to reduce no. of unique tokens
# And we are using Porter Stemmer from NLTK for stemming
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [9]:
# Initialising the TF-IDF Vectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None, tokenizer = tokenize,
                    strip_accents='unicode', analyzer='word',token_pattern=None,
                    ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)
# We have not used stopwords argument to remove stopwords in tfidf because the text which we are using are title
# of posts and because of this the no. of words are itself less so, there is no point in reducing no. of words further. 

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))

# Now transforming to TF-IDF Vectors
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

# saving the trained vectorizer model
filename = 'tfidf_vectors.sav'
joblib.dump(tfv, filename)

['tfidf_vectors.sav']

In [10]:
# Initialising the Count Vectorizer
ctv = CountVectorizer(analyzer='word',tokenizer = tokenize, ngram_range=(1, 3))
# Same as TF-IDF Vectorizer, here also we have not removed stopwords

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))

#Now transforming to Count Vectors
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

# saving the trained vectorizer model
filename = 'count_vectors.sav'
joblib.dump(ctv, filename)



['count_vectors.sav']

## Truncated SVD with TF-IDF

In [11]:
# Initialising SVD
svd = TruncatedSVD(n_components=200)

# Fitting the TF-IDF vectors
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# saving the SVD model
filename = 'trun_svd.sav'
joblib.dump(svd, filename)

# saving the Scaler model
filename = 'scaler.sav'
joblib.dump(scl, filename)

['scaler.sav']

## SGD Classifier

### SGD with SVD and Scaler

In [14]:
clf = SGDClassifier(alpha=1e-3, random_state=42, max_iter=30, tol=None)

# fitting the data
clf.fit(xtrain_svd_scl, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_svd_scl)

# calculating accuracies
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SGD classifer
filename = 'sgd_classifier.sav'
joblib.dump(clf, filename)

accuracy 0.5131587455544778


['sgd_classifier.sav']

### SGD with SVD only

In [15]:
clf = SGDClassifier(alpha=1e-3, random_state=42, max_iter=30, tol=None)

# fitting the data
clf.fit(xtrain_svd, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_svd)

# calculating accuracies
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SGD classifer
filename = 'sgd_classifier.sav'
joblib.dump(clf, filename)

accuracy 0.4813449725185904


['sgd_classifier.sav']

### SGD with TF-IDF vectors

In [19]:
clf = SGDClassifier(alpha=1e-3, random_state=42, max_iter=30)

# fitting the data
clf.fit(xtrain_tfv, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_tfv)

# calculating accuracies
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SGD classifer
filename = 'sgd_classifier.sav'
joblib.dump(clf, filename)

accuracy 0.5776268994503718


['sgd_classifier.sav']

### SGD with Count vectors

In [20]:
clf = SGDClassifier(alpha=1e-3, random_state=42, max_iter=30)

# fitting the data
clf.fit(xtrain_ctv, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_ctv)

# calculating accuracies
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SGD classifer
filename = 'sgd_classifier.sav'
joblib.dump(clf, filename)

accuracy 0.6069188490139024


['sgd_classifier.sav']

In [22]:
xtrain_ctv.shape, xvalid_tfv

((61856, 799833),
 <15465x53649 sparse matrix of type '<class 'numpy.float64'>'
 	with 166339 stored elements in Compressed Sparse Row format>)

### Linear SVC with Count vectors

In [26]:
# We are Linear Support Vector Classifcation(LinearSVC) in it instead of SVM because SVM is very slow still.
clf = LinearSVC()

# fitting the data
clf.fit(xtrain_ctv, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_ctv)

# calculating accuracy 
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SVC classifer
filename = 'linear_svc.sav'
joblib.dump(clf, filename)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
accuracy 0.5886841254445522


['linear_svc.sav']

### SVC with Count vectors

In [33]:
# We are Linear Support Vector Classifcation(LinearSVC) in it instead of SVM because SVM is very slow still.
clf = SVC(max_iter=1000)
# fitting the data
clf.fit(xtrain_ctv, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_ctv)

# calculating accuracy 
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SVC classifer
filename = 'linear_svc.sav'
joblib.dump(clf, filename)



accuracy 0.31315874555447787


['linear_svc.sav']