In [4]:
'''Libraries for loading,manipulating and plotting data'''
import numpy as np    
import pandas as pd   
import seaborn as sns 
import matplotlib.pyplot as plt 

'''Libraries for Machine Learning'''
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the preprocessed dataset (CSV expected in the working directory) and
# preview its first five rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer="preprocessed_data.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,flair,title,body,num_cmnts,created,score,comments,combined_features,label,tokenized,stopwords_removed,polarity,word_count
0,0,fwjdqr,AskIndia,4 days ago we had pending orders of 100 millio...,we are getting frantic calls from our pharma ...,6,2020-04-08 01:37:04,94,modi has stockholm syndrome to be fair the ev...,days ago we had pending orders of million h...,0,"['days', 'ago', 'we', 'had', 'pending', 'order...",days ago pending orders million hydroxychloroq...,0.146789,453
1,1,fizkkk,AskIndia,randians who were big time users of dating app...,i d my own stint with these apps a couple of m...,19,2020-03-16 00:18:06,20,someone matched with me just to tell me that ...,randians who were big time users of dating ap...,0,"['randians', 'who', 'were', 'big', 'time', 'us...",randians big time users dating apps like tinde...,0.103643,523
2,2,f25vx0,AskIndia,what does r india thinks about the flat earthers,i encountered a foreigner in ig who says round...,31,2020-02-11 22:40:55,4,i haven t found a indian yet who believes ear...,what does r india thinks about the flat earth...,0,"['what', 'does', 'r', 'india', 'thinks', 'abou...",r india thinks flat earthers encountered forei...,-0.01132,370
3,3,dtvliq,AskIndia,people who left their 9 to 5 jobs to pursue a ...,couldn t add askindia flair from the mobile br...,34,2019-11-10 02:27:35,44,an engineer doing advertisement shoots since ...,people who left their to jobs to pursue a c...,0,"['people', 'who', 'left', 'their', 'to', 'jobs...",people left jobs pursue career music art forms...,0.181008,456
4,4,b7pvwt,AskIndia,somebody want to kill my full family what to do,it s now 24hrs but local police station is not...,24,2019-04-01 06:30:35,94,calm down go to the sp office of your town fi...,somebody want to kill my full family what to ...,0,"['somebody', 'want', 'to', 'kill', 'my', 'full...",somebody want kill full family hrs local polic...,0.039844,327


In [3]:
# Model input is the 'stopwords_removed' text column; the target is 'label'.
features, labels = df['stopwords_removed'], df['label']

In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split (and therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

Build a pipeline for the Multinomial Naive Bayes model. `CountVectorizer` first converts the text into a matrix of token counts; `TfidfTransformer` then reweights those counts with TF-IDF so that each word's weight reflects its relevance in the text.

In [8]:
# Multinomial Naive Bayes baseline: raw token counts -> TF-IDF weights -> classifier.
# Step names ('vect', 'tfidf', 'clf') are kept as-is so any later code that
# addresses pipeline steps by name keeps working.
nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the printed value is unchanged, but the conventional order
# keeps this call consistent with classification_report below.
# classification_report and accuracy_score come from the notebook's import cell.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.556060606060606
              precision    recall  f1-score   support

           0       0.32      0.57      0.41        54
           1       1.00      0.14      0.25        70
           2       0.88      0.21      0.34        71
           3       0.79      0.68      0.73        56
           4       0.90      0.63      0.75        60
           5       0.57      0.66      0.61        58
           6       0.41      0.76      0.53        46
           7       0.43      0.76      0.55        58
           8       0.43      0.74      0.54        62
           9       1.00      0.57      0.73        61
          10       0.79      0.58      0.67        64

    accuracy                           0.56       660
   macro avg       0.68      0.57      0.55       660
weighted avg       0.70      0.56      0.55       660



In [9]:
# Linear classifier trained with SGD: raw token counts -> TF-IDF weights -> classifier.
# loss='hinge' with an L2 penalty; random_state is fixed for reproducibility.
# tol=None turns off the tolerance-based stopping criterion, so training runs
# for the full max_iter epochs.
# NOTE(review): max_iter=5 is a very small epoch budget — confirm it is
# intentional before relying on these scores.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
sgd.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = sgd.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the printed value is unchanged, but the conventional order
# keeps this call consistent with classification_report below.
# classification_report and accuracy_score come from the notebook's import cell.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.7651515151515151
              precision    recall  f1-score   support

           0       0.61      0.43      0.50        54
           1       0.84      0.74      0.79        70
           2       0.83      0.69      0.75        71
           3       0.89      0.86      0.87        56
           4       0.79      0.97      0.87        60
           5       0.71      0.88      0.78        58
           6       0.62      0.72      0.67        46
           7       0.76      0.71      0.73        58
           8       0.71      0.74      0.72        62
           9       0.83      0.89      0.86        61
          10       0.77      0.78      0.78        64

    accuracy                           0.77       660
   macro avg       0.76      0.76      0.76       660
weighted avg       0.77      0.77      0.76       660

