# Testing Various Models and Finding the Best Model to Predict and Detect Flairs

## Importing required Libraries

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Defining the Flairs we have taken into consideration

In [5]:
# Flair labels considered by this notebook, one per line; the order is kept
# because the classification reports list rows in this order.
flairs = [
    "AskIndia",
    "Non-Political",
    "[R]eddiquette",
    "Scheduled",
    "Photography",
    "Science/Technology",
    "Politics",
    "Business/Finance",
    "Policy/Economy",
    "Sports",
    "Food",
    "AMA",
]

## Storing and Representing the Data Collected 

In [6]:
# Load the collected data; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../Data/data.csv')

In [7]:
# print((classification_report(y_val, predicted_classes, target_names=target_names, digits = 6,labels=range(len(self.category_map)))))

In [8]:
# Preview only the first rows instead of rendering the whole frame.
data.head(10)

Unnamed: 0,author,authors,body,comment,comms_num,created,flair,id,score,title,url,combined_features
0,sanand_satwik,hashedram diabapp xataari Aashayrao sarcrasti...,hiits really tough time everyone recently lost...,im freelancer dont listen idiots cant freelanc...,134,1.586742e+09,AskIndia,g014wc,1049,lost job sick mother paralysed dad lockdown ea...,https://www.reddit.com/r/india/comments/g014wc...,lost job sick mother paralysed dad lockdown ea...
1,TWO-WHEELER-MAFIA,Kinky-Monk ak32009 fools_eye None DwncstSheep...,floods terrorist attacks famines due lack rain...,dont understand dont use money contingency fun...,204,1.586448e+09,AskIndia,fxofyu,643,government come begging bowl every crisis,https://www.reddit.com/r/india/comments/fxofyu...,government come begging bowl every crisisdont ...
2,sanand_satwik,AlternativeDrop6 TheRobotsHaveCome lanky32 pl...,hi folks really appreciate warm response previ...,anyone knows influential twitter bangalore ple...,94,1.586871e+09,AskIndia,g0zlly,764,mothers condition going worse due hepatitis b ...,https://www.reddit.com/r/india/comments/g0zlly...,mothers condition going worse due hepatitis b ...
3,GauGau24,Best-Economist Srthak_ ppccbba tb33296 damnji...,dont think weve spend much time family long ti...,yesterday major fight wife mom mominlaw father...,117,1.587384e+09,AskIndia,g4lrhm,158,people stuck family lockdown family falling apart,https://www.reddit.com/r/india/comments/g4lrhm...,people stuck family lockdown family falling ap...
4,indianoogler,RedDevil-84 khushraho kingof-potatos congrats...,corona virus given time think life choices bit...,plan finances work enjoy ways healthy go see w...,206,1.586207e+09,AskIndia,fvy95j,266,men 30+ decided get married plan old age,https://www.reddit.com/r/india/comments/fvy95j...,men 30+ decided get married plan old ageplan f...
5,throaway-forprivacy,brabarusmark HeresyLight jprbruce KyraOfRivia...,sorry reddit post going long please bear also ...,one thing say dont try clear loans taking loan...,83,1.586957e+09,AskIndia,g1lmhg,429,please advice reality punched face today,https://www.reddit.com/r/india/comments/g1lmhg...,please advice reality punched face todayone th...
6,sashankps,Cumberbitch01 nimodbomin satyasys AimHrimKlee...,need hour many citizens contributing pm cares ...,mother works sbi told sbi account maintained s...,77,1.587306e+09,AskIndia,g42vfo,211,pm cares fund used one know track flow transpa...,https://www.reddit.com/r/india/comments/g42vfo...,pm cares fund used one know track flow transpa...
7,cataglottis,_azadak 1209naveen reti_opening kokishinkiba ...,seems like everybody critical direction india ...,money thing would choose india hilltop castle ...,159,1.586812e+09,AskIndia,g0igt7,135,r india money bar would prefer stay india outside,https://www.reddit.com/r/india/comments/g0igt7...,r india money bar would prefer stay india outs...
8,dcm7734,seidenkaufman goodgod23 Batwoman_2017 wisevag...,im dating grad student coimbature age however ...,every human different please take generalized ...,103,1.587474e+09,AskIndia,g590ut,64,im american dating south indian need advice,https://www.reddit.com/r/india/comments/g590ut...,im american dating south indian need adviceeve...
9,maddy2011,Indianopolice JinKazamaWins indiandude007 don...,ill go first ive known experienced barbeque bu...,good mattressgood comfy pair shoesand many mos...,117,1.587658e+09,AskIndia,g6isgb,49,absolutely worth money,https://www.reddit.com/r/india/comments/g6isgb...,absolutely worth moneygood mattressgood comfy ...


In [9]:
# Replace missing values with empty strings so the text vectorizers never see NaN.
# Rebinding instead of `inplace=True` keeps the step explicit and chainable.
data = data.fillna("")

# Defining the Various Models We Will Train and Test On

## Naive Bayes

In [10]:
def naivebayes_classifier(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + Multinomial Naive Bayes pipeline and print its scores.

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    nb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

## Linear SVM

In [None]:
def linear_svm(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + linear SVM (SGD with hinge loss) pipeline and print its scores.

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    svm_pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # max_iter=5 with tol=None: a fixed five passes, no tolerance-based early stop.
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=42, max_iter=5, tol=None)),
    ])
    svm_pipeline.fit(X_train, y_train)

    y_pred = svm_pipeline.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

In [None]:
def sgd(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + SGDClassifier pipeline and print its scores.

    The classifier is configured exactly like the one in ``linear_svm``
    (hinge loss, L2 penalty, alpha=1e-3, five passes, random_state=42).

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    # Named differently from the function so the pipeline does not shadow `sgd`.
    sgd_pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # max_iter=5 with tol=None: a fixed five passes, no tolerance-based early stop.
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=42, max_iter=5, tol=None)),
    ])
    sgd_pipeline.fit(X_train, y_train)
    y_pred = sgd_pipeline.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

## Logistic Regression

In [11]:
def logisticreg(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + Logistic Regression pipeline and print its scores.

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    # LogisticRegression comes from the notebook's import cell; no local import needed.
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # C is the inverse regularization strength, so 1e5 means very weak regularization.
        ('clf', LogisticRegression(n_jobs=1, C=1e5)),
    ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

## Random Forest

In [13]:
def randomforest(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + Random Forest pipeline and print its scores.

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    ranfor = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # 1000 trees; the fixed seed keeps repeated runs comparable.
        ('clf', RandomForestClassifier(n_estimators=1000, random_state=42)),
    ])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

## Multilayer Perceptron

In [14]:
def mlpclassifier(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words + TF-IDF + multilayer perceptron pipeline and print its scores.

    Parameters
    ----------
    X_train, X_test
        Text documents used for fitting and for evaluation (the notebook
        passes pandas Series taken from the data frame).
    y_train, y_test
        Flair labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The accuracy and the per-flair classification report are printed.
    """
    mlp = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # Three hidden layers of 30 units each. The seed is fixed, like the other
        # stochastic models in this notebook, so weight initialisation and
        # shuffling are reproducible between runs.
        ('clf', MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=42)),
    ])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # Use the shared module-level `flairs` list for both the row labels and
    # their display names instead of repeating the literal in every function.
    print(classification_report(y_test, y_pred, target_names=flairs, labels=flairs))

## Splitting Data Into Training and Testing Set

### Evaluating each model defined above, reporting accuracy together with per-flair precision, recall, f1-score, and support.

In [16]:
def train_test(X,y):
    """Hold out 20% of ``X``/``y`` and print every model's evaluation on that split.

    ``X`` is the feature column and ``y`` the matching labels. The split is
    seeded (random_state=42), so each call partitions the rows identically.
    Nothing is returned; each evaluator prints its own report.
    """
    split = train_test_split(X, y, test_size=0.2, random_state = 42)

    # (heading, evaluator) pairs, listed in the order their reports are printed.
    evaluators = (
        ("Results of Naive Bayes Classifier", naivebayes_classifier),
        ("Results of Linear Support Vector Machine", linear_svm),
        ("Results of Logistic Regression", logisticreg),
        ("Results of Random Forest", randomforest),
        ("Results of MLP Classifier", mlpclassifier),
        ("Results of SGD Classifier", sgd),
    )
    for heading, evaluate in evaluators:
        print(heading)
        # `split` unpacks to X_train, X_test, y_train, y_test.
        evaluate(*split)


### Evaluating Measures

In [17]:
# Labels plus each candidate text feature, pulled out of the frame once.
data_flair = data["flair"]
data_combined = data["combined_features"]
data_comment = data["comment"]
data_title = data["title"]
data_body = data["body"]
data_url = data["url"]

# Run the full model comparison once per feature, in this order.
feature_runs = [
    ("Flair Detection using Title as Feature", data_title),
    ("Flair Detection using Body as Feature", data_body),
    ("Flair Detection using URL as Feature", data_url),
    ("Flair Detection using Comments as Feature", data_comment),
    ("Flair Detection using Combined Features", data_combined),
]
for heading, feature in feature_runs:
    print(heading)
    train_test(feature, data_flair)

Flair Detection using Title as Feature
Results of Naive Bayes Classifier
accuracy 0.8646408839779005
                    precision    recall  f1-score   support

          AskIndia       0.84      0.79      0.82        34
     Non-Political       0.61      0.69      0.65        32
     [R]eddiquette       0.00      0.00      0.00         0
         Scheduled       0.98      1.00      0.99        40
       Photography       0.90      1.00      0.95        43
Science/Technology       0.85      0.78      0.82        51
          Politics       0.84      0.81      0.83        47
  Business/Finance       0.91      0.86      0.88        35
    Policy/Economy       0.83      0.87      0.85        39
            Sports       1.00      0.95      0.97        41
              Food       0.00      0.00      0.00         0
               AMA       0.00      0.00      0.00         0

         micro avg       0.86      0.86      0.86       362
         macro avg       0.65      0.65      0.65       3

accuracy 0.4530386740331492
                    precision    recall  f1-score   support

          AskIndia       0.77      0.88      0.82        34
     Non-Political       1.00      0.09      0.17        32
     [R]eddiquette       0.00      0.00      0.00         0
         Scheduled       1.00      1.00      1.00        40
       Photography       0.19      1.00      0.32        43
Science/Technology       0.95      0.37      0.54        51
          Politics       1.00      0.15      0.26        47
  Business/Finance       1.00      0.31      0.48        35
    Policy/Economy       0.88      0.18      0.30        39
            Sports       1.00      0.10      0.18        41
              Food       0.00      0.00      0.00         0
               AMA       0.00      0.00      0.00         0

         micro avg       0.45      0.45      0.45       362
         macro avg       0.65      0.34      0.34       362
      weighted avg       0.86      0.45      0.45       362

Results o

accuracy 0.8259668508287292
                    precision    recall  f1-score   support

          AskIndia       1.00      0.82      0.90        34
     Non-Political       1.00      0.69      0.81        32
     [R]eddiquette       0.00      0.00      0.00         0
         Scheduled       0.63      1.00      0.78        40
       Photography       0.77      1.00      0.87        43
Science/Technology       0.95      0.71      0.81        51
          Politics       0.69      0.85      0.76        47
  Business/Finance       1.00      0.77      0.87        35
    Policy/Economy       0.91      0.77      0.83        39
            Sports       0.89      0.80      0.85        41
              Food       0.00      0.00      0.00         0
               AMA       0.00      0.00      0.00         0

         micro avg       0.83      0.83      0.83       362
         macro avg       0.65      0.62      0.62       362
      weighted avg       0.86      0.83      0.83       362

Results o

accuracy 0.7707182320441989
                    precision    recall  f1-score   support

          AskIndia       0.36      1.00      0.53        34
     Non-Political       0.88      0.66      0.75        32
     [R]eddiquette       0.00      0.00      0.00         0
         Scheduled       0.95      1.00      0.98        40
       Photography       1.00      0.86      0.92        43
Science/Technology       1.00      0.47      0.64        51
          Politics       0.85      0.87      0.86        47
  Business/Finance       0.82      0.66      0.73        35
    Policy/Economy       0.82      0.69      0.75        39
            Sports       1.00      0.78      0.88        41
              Food       0.00      0.00      0.00         0
               AMA       0.00      0.00      0.00         0

         micro avg       0.77      0.77      0.77       362
         macro avg       0.64      0.58      0.59       362
      weighted avg       0.87      0.77      0.79       362

Results o

In [167]:
# NOTE(review): `V` and `cat` are not defined in any visible cell of this notebook,
# so this cell raised NameError on a fresh kernel. They are ASSUMED here to be the
# combined text features and the flair labels -- confirm before trusting the saved model.
V = data.combined_features
cat = data.flair

X_train, X_test, y_train, y_test = train_test_split(V, cat, test_size=0.2, random_state = 42)
ranfor = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                  ])
RM = ranfor.fit(X_train, y_train)
# Context manager so the file handle is closed as soon as the model is written.
with open("RM.pkl", 'wb') as model_file:
    pickle.dump(RM, model_file)
y_pred = ranfor.predict(X_test)

## Logistic Regression gives the best model. Hence, we save the model at app/model.

In [168]:
# Fit the logistic-regression pipeline on the X_train / y_train split created in
# the previous cell and save the fitted pipeline to LR.pkl.
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg = logreg.fit(X_train, y_train)
# Context manager so the file handle is closed as soon as the model is written.
with open("LR.pkl", 'wb') as model_file:
    pickle.dump(logreg, model_file)