In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from numpy import random

from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import re

In [33]:
flairs = ["AskIndia", "Coronavirus", "Non-Political", "Scheduled", "Photography", "Science/Technology",
          "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "AMA"]

In [17]:
data = pd.read_csv('Flair_data.csv')
data.head(2)

Unnamed: 0.1,Unnamed: 0,flair,title,score,id,url,comms_num,created,body,author,comments,over_18
0,0,AskIndia,4 days ago we had pending orders of 100 millio...,97,fwjdqr,https://www.reddit.com/r/india/comments/fwjdqr...,6,1586290000.0,> We are getting frantic calls from our pharma...,india_ko_vanakkam,"Modi has Stockholm syndrome To be fair, the e...",False
1,1,AskIndia,Randians who were big time users of dating app...,19,fizkkk,https://www.reddit.com/r/india/comments/fizkkk...,19,1584298000.0,I'd my own stint with these apps(a couple of m...,__knockknockturnal__,Someone matched with me just to tell me that ...,False


In [18]:
#Flairs of "[R]eddiquette" are less, so removing
data = data[data["flair"] != "[R]eddiquette"]

In [19]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def get_date(created):
    return dt.datetime.fromtimestamp(created)

def string_form(value):
    return str(value)

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [26]:
#Convert the text in string
data['title'] = data['title'].apply(string_form)
data['body'] = data['body'].apply(string_form)
data['comments'] = data['comments'].apply(string_form)

#Clean Text
data['title'] = data['title'].apply(clean_text)
data['body'] = data['body'].apply(clean_text)
data['comments'] = data['comments'].apply(clean_text)

# feature_combine = data["title"] + data["comments"] + data["body"]
feature_combine_title_n_body = data["title"] + data["body"]
# data = data.assign(feature_combine = feature_combine)
data = data.assign(title_n_body = feature_combine_title_n_body)

In [21]:
def logisticreg(X_train, X_test, y_train, y_test):
    
    from sklearn.linear_model import LogisticRegression
    
    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [22]:
def linear_svm(X_train, X_test, y_train, y_test):
    
    from sklearn.linear_model import SGDClassifier
    
    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)
        
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [23]:
def nb_classifier(X_train, X_test, y_train, y_test):
    from sklearn.naive_bayes import MultinomialNB
    
    nb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [34]:
def train_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)

In [36]:
cat = data.flair

V = data.title_n_body
W = data.comments
X = data.title
Y = data.body
Z = data.url

print("Flair Detection using Title as Feature")
train_test(X,cat)
print("\nFlair Detection using Body as Feature")
# train_test(Y,cat)
print("\nFlair Detection using URL as Feature")
# train_test(Z,cat)
print("\nFlair Detection using Comments as Feature")
# train_test(V,cat)

Flair Detection using Title as Feature
Results of Naive Bayes Classifier
accuracy 0.6626506024096386
                    precision    recall  f1-score   support

          AskIndia       0.88      0.97      0.92        60
       Coronavirus       0.53      0.82      0.65        62
     Non-Political       0.52      0.45      0.48        69
         Scheduled       0.66      0.93      0.77        84
       Photography       0.72      0.83      0.77        70
Science/Technology       0.77      0.89      0.83        64
          Politics       0.84      0.66      0.74        71
  Business/Finance       0.49      0.42      0.46        66
    Policy/Economy       0.63      0.56      0.59        84
            Sports       0.58      0.52      0.55        62
              Food       0.67      0.36      0.46        73
               AMA       0.71      0.57      0.63        65

          accuracy                           0.66       830
         macro avg       0.67      0.66      0.65       8

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
