In [1]:
# importing different libraries
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
# loading our dataset
train_df = pd.read_csv('../input/midas-task/reddit_data.csv')

# dropping rows having null values
train_df.dropna(inplace=True)

# creating a label column to encode our text labels to no.
le = preprocessing.LabelEncoder()
le.fit(train_df["flair"])
train_df["label"] = le.transform(train_df["flair"])
train_df.head()

Unnamed: 0,text,flair,dirty_text,label
0,top comments toi article drop us oil prices,Non-Political,Top comments on a TOI article about the drop i...,3
1,disappointed,Politics,Disappointed,5
2,hacking networking security 2 books 1 hacking ...,Non-Political,Hacking: Networking and Security (2 Books in 1...,3
3,zakir khan irfan junejo live instagram session...,Non-Political,Zakir Khan and Irfan Junejo live Instagram Ses...,3
4,cursing quentin tarantino movie,Non-Political,Cursing In A Quentin Tarantino Movie,3


> We have two columns of texts "text" and "dirty_text" which we can use for training. The type of vectorizer       and tokenizer which we will be using further for modelling and creating pipeline will not need cleaned text. The text will be cleaned during vectorization itself, so we will be using "dirty_text" column for training instead of "text" beacuse it is previously cleaned.
    
> Note: Actually, I have tried training with both "dirty_text" and "text" both to validate my above assumption. The results will be shown further.

In [3]:
# Splitting the data and using "dirty_text" for training
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.dirty_text, train_df.label, 
                                                  stratify=train_df.label, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [4]:
print(xtrain.shape)
print(xvalid.shape)

(61856,)
(15465,)


# Strategy for Modelling
We will be using two types of vectorizer:
1. TfidfVectorizer
2. CountVectorizer
We will be using our own tokenizer from NLTK.
    
   ## Models
   1. Logistic Regression
   2. Multinomial Naive Bayes
   3. Support Vector Machines
   4. XGBoost
    


In [5]:
# function for tokenization, we are also using stemming to reduce no. of unique tokens
# And we are using Porter Stemmer from NLTK for stemming
# def wordtokenize(text):
#     tokens = word_tokenize(text)
#     stems = []
#     for item in tokens:
#         stems.append(PorterStemmer().stem(item))
#     return stems

In [6]:
# Initialising the TF-IDF Vectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None,
                    strip_accents='unicode', analyzer='word',
                    ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)
# We have not used stopwords argument to remove stopwords in tfidf because the text which we are using are title
# of posts and because of this the no. of words are itself less so, there is no point in reducing no. of words further. 

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))

# Now transforming to TF-IDF Vectors
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

# saving the trained vectorizer model
filename = 'tfidf_vectors.sav'
joblib.dump(tfv, filename)

['tfidf_vectors.sav']

 I tried to use count vectorizer with tokenizer first and it was giving a 1% increase in accuracies but during deployment when running the application with gunicorn it is creating a problem in serialization og pickle file of count vectors. I have tried all possible ways, but the problem is unsolved for me. I have tried to pickle the functon alone but this way the problem is not solved. I have also tried to save the pickle file of vectorizer with the tokenizer function but still due to gunicorn, it's throwing error. 

In [7]:
# Initialising the Count Vectorizer
# ctv = CountVectorizer(analyzer='word', tokenizer = wordtokenize, ngram_range=(1, 3))
ctv = CountVectorizer(analyzer='word', ngram_range=(1, 3))

# Same as TF-IDF Vectorizer, here also we have not removed stopwords

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))

#Now transforming to Count Vectors
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)


# saving the trained vectorizer model
filename = 'count_vectors.sav'
joblib.dump(ctv, filename)

['count_vectors.sav']

##  Logistic Regression
### Logistic Regression with TF-IDF Vectors 

In [8]:
# Initialising Logistic Regression model
clf = LogisticRegression(C=1.0)

# training the model
clf.fit(xtrain_tfv, ytrain)

# predicting with the trained model
predictions = clf.predict(xvalid_tfv)

# calculating accuracy on validation data
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the trained Logistic Regression model
filename = 'lgr_tfidf.sav'
joblib.dump(clf, filename)

accuracy 0.6177174264468154


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


['lgr_tfidf.sav']

### Logistic Regresson with Count Vectors

In [9]:
# Initialising Logistic Regression model
clf = LogisticRegression(C=1.0)

# training the model
clf.fit(xtrain_ctv, ytrain)

# predicting with the trained model
predictions = clf.predict(xvalid_ctv)

# calculating accuracy on validation data
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the trained Logistic Regression model
filename = 'lgr_count.sav'
joblib.dump(clf, filename)

accuracy 0.6219204655674103


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


['lgr_count.sav']

## Naive Bayes
### Naive Bayes with TF-IDF Vectors

In [10]:
# Initialising Logistic Regression model
clf = MultinomialNB()

# training the model
clf.fit(xtrain_tfv, ytrain)

# predicting with the trained model
predictions = clf.predict(xvalid_tfv)

# calculating accuracy on validation data
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the trained Naive Bayes model
filename = 'nvb_tfidf.sav'
joblib.dump(clf, filename)


accuracy 0.5760103459424507


['nvb_tfidf.sav']

### Naive Bayes with Count Vectors

In [11]:
# Initialising Logistic Regression model
clf = MultinomialNB()

# training the model
clf.fit(xtrain_ctv, ytrain)

# predicting with the trained model
predictions = clf.predict(xvalid_ctv)

# calculating accuracy on validation data
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the trained Naive Bayes model
filename = 'nvb_count.sav'
joblib.dump(clf, filename)


accuracy 0.611186550274814


['nvb_count.sav']

## Support Vector Machines(SVM)

SGD Classifiers are a class of linear classifiers like SVMs, Logistic Regression, etc. By default, it implements
SVM classfier with Stochastic Gradient Descent(SGD). The type of classifier can be changed through loss parameter
by default it uses "hinge" loss. In the cell below, I have mentioned only that combination of SVM and vector which worked best.
 
 
I have experimented a lot with SMVs and then came to conclusion to use **SGD Classifier with Count Vectors**.
Please refer to this [notebook](https://www.kaggle.com/adityakumar01/svm-classifier) or the notebook for svm in this repository for all the experiments regarding SVMs with different vectors.

In [12]:
# intialising the SVM classifier
clf = SGDClassifier(alpha=1e-3, random_state=42, max_iter=30)

# fitting the data
clf.fit(xtrain_ctv, ytrain)

# predicting on validation data
predictions = clf.predict(xvalid_ctv)

# calculating accuracies
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the SGD classifer
filename = 'sgd_classifier.sav'
joblib.dump(clf, filename)

accuracy 0.6217264791464597


['sgd_classifier.sav']

## XGBoost
   For XGBoost, I have created a separated notebook where I have experimented with different combinations of XGBoost with different Vectorizer. As XGBoost take a lot of time in training that's why I have not included in this notebook.
   Please refer to this [notebook](https://www.kaggle.com/adityakumar01/xgboost).

## Augmentations

Now from above we know, Logistic Regression with Count Vectors work best. So, for further we are only going to use Lgoistic Regression with Count Vectors.

### Synthetic Minority Oversampling Technique(SMOTE)

In [13]:
# initialising oversampling method SMOTE
oversample = SMOTE()

In [14]:
# checking distribution of labels before oversampling
print(Counter(ytrain), Counter(yvalid))

Counter({5: 18486, 3: 16146, 0: 10491, 2: 6570, 1: 3870, 6: 3149, 4: 3144}) Counter({5: 4622, 3: 4036, 0: 2623, 2: 1643, 1: 968, 6: 787, 4: 786})


In [15]:
# fitting the data into SMOTE for oversampling
xtrain_ctv, ytrain = oversample.fit_resample(xtrain_ctv, ytrain)
xvalid_ctv, yvalid = oversample.fit_resample(xvalid_ctv, yvalid)

In [16]:
# checking the ditribution of data after oversampling
print(Counter(ytrain), Counter(yvalid))

Counter({3: 18486, 5: 18486, 6: 18486, 2: 18486, 4: 18486, 1: 18486, 0: 18486}) Counter({3: 4622, 0: 4622, 5: 4622, 6: 4622, 4: 4622, 1: 4622, 2: 4622})


In [17]:
# Initialising Logistic Regression model
clf = LogisticRegression(C=1.0)

# training the model
clf.fit(xtrain_ctv, ytrain)

# predicting with the trained model
predictions = clf.predict(xvalid_ctv)

# calculating accuracy on validation data
print('accuracy %s' % accuracy_score(yvalid, predictions))

# saving the trained Logistic Regression model
filename = 'lgr_count_oversample.sav'
joblib.dump(clf, filename)

accuracy 0.4546578475613525


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


['lgr_count_oversample.sav']