In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import re

import nltk, string
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.model_selection import (train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV)
from sklearn.metrics import (accuracy_score, confusion_matrix)
from sklearn import metrics

In [3]:
data = pd.read_csv('data.csv')
data.drop(["Unnamed: 0"], inplace=True, axis=1)
data.head()

Unnamed: 0,Title,ID,Upvotes,URL,Num_comments,Creation Date,Body,Is_original,Comments,Flair,Comments_new
0,Uber driver scammed me by not picking me up an...,cc6wf1,96,https://www.reddit.com/r/india/comments/cc6wf1...,59,2019-07-12 13:53:45,Yesterday it was raining heavily and needed to...,False,"In your case, the driver might have come near ...",[R]eddiquette,"In your case, the driver might have come near ..."
1,Assembly Elections Results Megathread,a54j07,277,https://www.reddit.com/r/india/comments/a54j07...,856,2018-12-11 14:58:33,"---\n# Rajasthan\n\nTotal Seats: 199, Majority...",False,"If elections are not one-sided, it's always go...",[R]eddiquette,"If elections are not one-sided, it's always go..."
2,Travelled to Kashmir 2 years back. This is the...,ekd70o,19,https://www.youtube.com/watch?v=6WgnfARwOdA,7,2020-01-05 21:46:20,,True,Cringe Good Job brother... It looks good !,Photography,Cringe Good Job brother... It looks good !
3,Reminder to file your Income tax return - 31st...,ct4z8k,59,https://www.reddit.com/r/india/comments/ct4z8k...,21,2019-08-21 03:51:02,"Just filing my tax returns, thought of remindi...",False,"Hey, thanks! Skipped my mind. 16 years of educ...",[R]eddiquette,"Hey, thanks! Skipped my mind. 16 years of educ..."
4,"After eating 900 mice, cat goes on Haj! RJio j...",fgarrj,214,https://www.financialexpress.com/opinion/after...,39,2020-03-10 16:46:22,,False,Jio gives free service for six months\n\nIndia...,Business/Finance,Jio gives free service for six months\n\nIndia...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2894 entries, 0 to 2893
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          2894 non-null   object
 1   ID             2894 non-null   object
 2   Upvotes        2894 non-null   int64 
 3   URL            2894 non-null   object
 4   Num_comments   2894 non-null   int64 
 5   Creation Date  2894 non-null   object
 6   Body           1070 non-null   object
 7   Is_original    2894 non-null   bool  
 8   Comments       2741 non-null   object
 9   Flair          2894 non-null   object
 10  Comments_new   2741 non-null   object
dtypes: bool(1), int64(2), object(8)
memory usage: 229.0+ KB


In [5]:
data["Flair"].unique()

array(['[R]eddiquette', 'Photography', 'Business/Finance',
       'Science/Technology', 'AMA', 'Coronavirus', 'Sports', 'Food',
       'Policy/Economy', 'AskIndia', 'Politics', 'Non-Political'],
      dtype=object)

In [6]:
relevant_features = ["Title", "Body", "Comments", "Flair"]
relevant_data = data[relevant_features].copy()
relevant_data.head()

Unnamed: 0,Title,Body,Comments,Flair
0,Uber driver scammed me by not picking me up an...,Yesterday it was raining heavily and needed to...,"In your case, the driver might have come near ...",[R]eddiquette
1,Assembly Elections Results Megathread,"---\n# Rajasthan\n\nTotal Seats: 199, Majority...","If elections are not one-sided, it's always go...",[R]eddiquette
2,Travelled to Kashmir 2 years back. This is the...,,Cringe Good Job brother... It looks good !,Photography
3,Reminder to file your Income tax return - 31st...,"Just filing my tax returns, thought of remindi...","Hey, thanks! Skipped my mind. 16 years of educ...",[R]eddiquette
4,"After eating 900 mice, cat goes on Haj! RJio j...",,Jio gives free service for six months\n\nIndia...,Business/Finance


In [7]:
relevant_data['id'] = relevant_data['Flair'].factorize()[0]

In [8]:
flair_index, index_flair = {},{}
flair_category = relevant_data[['Flair', 'id']].drop_duplicates().sort_values('id')
dict_flair = flair_category.to_dict('split')

for pair in dict_flair['data']:
    flair_index[pair[0]] = pair[1]
    index_flair[pair[1]] = pair[0]
flair_index

{'[R]eddiquette': 0,
 'Photography': 1,
 'Business/Finance': 2,
 'Science/Technology': 3,
 'AMA': 4,
 'Coronavirus': 5,
 'Sports': 6,
 'Food': 7,
 'Policy/Economy': 8,
 'AskIndia': 9,
 'Politics': 10,
 'Non-Political': 11}

In [9]:
relevant_data["Combine"] = relevant_data["Title"].copy()
for i in range(len(relevant_data)):
    if type(relevant_data.loc[i]['Body']) != float:
        relevant_data["Combine"][i] += ' ' + relevant_data["Body"][i]
    if type(relevant_data.loc[i]['Comments']) != float:
        relevant_data["Combine"][i] += ' ' + relevant_data["Comments"][i]
relevant_data["Combine"].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Combine"][i] += ' ' + relevant_data["Body"][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Combine"][i] += ' ' + relevant_data["Comments"][i]


0    Uber driver scammed me by not picking me up an...
1    Assembly Elections Results Megathread ---\n# R...
2    Travelled to Kashmir 2 years back. This is the...
3    Reminder to file your Income tax return - 31st...
4    After eating 900 mice, cat goes on Haj! RJio j...
Name: Combine, dtype: object

In [10]:
nltk.download('stopwords')
STOPWORDS = nltk.corpus.stopwords.words('english')
REPLACE_SPACES = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arghyadeep99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def clean_text(text):
    text = text.lower()
    text = REPLACE_SPACES.sub(' ', text) 
    text = BAD_SYMBOLS.sub(' ', text)
    text = text.replace('x', ' ')
    text = text.replace('\n', ' ')
    
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [12]:
relevant_data['Combine'] = relevant_data['Combine'].apply(clean_text)
relevant_data['Combine'] = relevant_data['Combine'].str.replace('\d+', ' ')

In [13]:
relevant_data['Combine'][1002]

'bjp leader arrested raping minor   years mumbai bjp need get house order joke many criminals positions disgusting recent court decisions ridiculous bjp must take responsibility pedophile stop calling leader say another bjp pedophile found party members alright guys hold another rally yeah let reuse posters last rally staged support child rapist removed standard becoming member bjp damn putrichod bjp knack recruiting rapist future cm pm beti bachao jeez many rapists party arre kyun pakda bechare ko sala bina matlab victim ke gharwale darr jayenge ki sala saboot na mitane lage ye pradhan mantri ho sake bacche bachao yojana support march coming       raj thackeray theatening mumbaikars come  nd aug summoned enforcement directorate appear interesting week ahead mumbai bois name please seen news tv regarding surprised either removed beti chudvavv office bearer calling bjp leader misleading'

In [14]:
nltk.download('punkt')
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

#remove punctuation, lowercase, stem
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, 
                             stop_words=STOPWORDS, 
                             sublinear_tf=True, 
                             min_df=5, 
                             norm = 'l2', 
                             encoding='latin-1', 
                             ngram_range=(1, 2))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/arghyadeep99/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# Extracting the features by fitting the Vectorizer on Combined Data
##labels = relevant_data['id']    # Series containing all the post labels
#print(feat.shape)

relevant_data["norm_combine"] = relevant_data["Combine"].apply(normalize)

In [20]:
feat = vectorizer.fit_transform(relevant_data['Combine']).toarray()



In [21]:
labels = relevant_data['id']
feat.shape

(2894, 32467)

In [22]:
N = 5    # Number of examples to be listed
for f, i in sorted(flair_index.items()):
    chi2_feat = chi2(feat, labels == i)
    indices = np.argsort(chi2_feat[0])
    feat_names = np.array(vectorizer.get_feature_names())[indices]
    unigrams = [w for w in feat_names if len(w.split(' ')) == 1]
    bigrams = [w for w in feat_names if len(w.split(' ')) == 2]
    print("\nFlair '{}':".format(f))
    print("Most correlated unigrams:\n\t. {}".format('\n\t. '.join(unigrams[-N:])))
    print("Most correlated bigrams:\n\t. {}".format('\n\t. '.join(bigrams[-N:])))


Flair 'AMA':
Most correlated unigrams:
	. favorit
	. question
	. hello
	. hi
	. ama
Most correlated bigrams:
	. hi thank
	. ama question
	. answer question
	. ask anyth
	. thank ama

Flair 'AskIndia':
Most correlated unigrams:
	. linkedin
	. famili
	. parent
	. relationship
	. depend
Most correlated bigrams:
	. drive car
	. irrespect age
	. easi get
	. get marri
	. tier colleg

Flair 'Business/Finance':
Most correlated unigrams:
	. payment
	. jio
	. ambani
	. relianc
	. bank
Most correlated bigrams:
	. invest rs
	. vodafon idea
	. anil ambani
	. ye bank
	. jio platform

Flair 'Coronavirus':
Most correlated unigrams:
	. test
	. patient
	. hospit
	. coronaviru
	. covid
Most correlated bigrams:
	. posit covid
	. news news
	. covid case
	. test posit
	. coronaviru case

Flair 'Food':
Most correlated unigrams:
	. biryani
	. chutney
	. chicken
	. delici
	. recip
Most correlated bigrams:
	. look yummi
	. look tasti
	. share recip
	. ice cream
	. look delici

Flair 'Non-Political':
Most corre

In [23]:
X_train, X_test, y_train, y_test = train_test_split(relevant_data['Combine'], relevant_data['Flair'], 
                                                    test_size=0.2, 
                                                    random_state=97)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2315,) (2315,) (579,) (579,)


In [24]:
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)

# Creating an instance of the TFID transformer
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_counts)

In [51]:
def naive_bayes(X_train, X_test, y_train, y_test):
    
    nb_fit = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', MultinomialNB()),
                 ])
    nb_fit.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = nb_fit.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))
    return nb_fit
    
def random_forest(X_train, X_test, y_train, y_test):
    
    forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', RandomForestClassifier()),
                 ])
    forest.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = forest.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))
    return forest

# Support Vector Machines Classifier 
def svc(X_train, X_test, y_train, y_test):
    
    svc_fit = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', SVC(C=1, gamma=1, kernel='linear')),
                 ])
    svc_fit.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = svc_fit.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))
    return svc_fit

# Logistic Regression Classifier 
def log_reg(X_train, X_test, y_train, y_test):
    
    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', LogisticRegression()),
                 ])
    logreg.fit(X_train, y_train)     # Fitting the data to the trianing data

    # Making Predictions on the test data
    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))
    return logreg

In [49]:
naive_bayes(X_train, X_test, y_train, y_test)
random_forest(X_train, X_test, y_train, y_test)
svc(X_train, X_test, y_train, y_test)
log_reg(X_train, X_test, y_train, y_test)

Model Accuracy: 0.4835924006908463
Model Accuracy: 0.5267702936096719
Model Accuracy: 0.6424870466321243
Model Accuracy: 0.613126079447323


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model', LogisticRegression())])

## So, SVM is performing the best. So, I will save this model. 

In [52]:
import joblib

model = svc(X_train, X_test, y_train, y_test)
model.fit(X_train, y_train)
joblib.dump(model, 'final_svm.sav')

Model Accuracy: 0.6424870466321243


['final_svm.sav']

In [27]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

--2020-06-09 11:35:56--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.174.240, 2404:6800:4009:801::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.174.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-06-09 11:38:11 (2.90 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]

--2020-06-09 11:38:11--  https://raw.githubusercontent.com/google-research/bert/master/modeling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37922 (37K) [text/plain]
Saving to: ‘modeling.py’


2020-06-09 11:38:12 (4.93 MB/s) - ‘modeli

In [35]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
train_InputExamples = X_train.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                   text_a = x['Combine'], 
                                                                   text_b = None, 
                                                                   label = x['id']))
val_InputExamples = X_test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x['Combine'], 
                                                                   text_b = None, 
                                                                   label = x['id']))

AttributeError: module 'tensorflow._api.v2.train' has no attribute 'Optimizer'

In [None]:
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
        vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128

# Convert our train and validation features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

val_features = bert.run_classifier.convert_examples_to_features(val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)