In [46]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer

In [47]:
import numpy as np
import pandas as pd
import cleaning
import augmentation
import back_translation

Read Data

In [48]:
df_read = pd.read_csv("assignment.csv")
# df.isnull().sum().sum()
df_read = df_read.dropna() 
# df['label'].unique()

In [49]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
%autoreload 2

In [51]:
df = cleaning.clean_data(df_read)
df.head(5)

Unnamed: 0,content,label,name
0,im struggling to understand how i feel about s...,Emotion,im struggling to understand how i feel about s...
1,new modis promise to ban plastic straws this m...,Financial,new modis promise to ban plastic straws this m...
2,survey middleincome americans spending less am...,Financial,survey middleincome americans spending less am...
3,watch wall street ended lower after bank stock...,Financial,watch wall street ended lower after bank stock...
4,someone selling a ton of se puts to open sep a...,Financial,someone selling a ton of se puts to open sep a...


In [52]:
df = df.drop('content',axis=1)

## Back translation

#### English to French to English

In [53]:
obj1 = back_translation.back_tr(df,'Sport').main(50)



In [54]:
obj2 = back_translation.back_tr(df,'Science').main(100)

In [55]:
df_bt       = pd.concat([df,obj1,obj2],ignore_index=True)
df2 = df_bt.copy()

### synonym replacement

In [56]:
#for Science
science_augmentation = augmentation.synonim_aug(df2,multiples=2) 
science_augmentation['label'] = len(science_augmentation)*['Science']
df2 = pd.concat([df2,science_augmentation])

#for Sport
science_augmentation = augmentation.synonim_aug(df2,class_="Sport",multiples=2) 
science_augmentation['label'] = len(science_augmentation)*['Sport']
df2 = pd.concat([df2,science_augmentation])

#stopword removal
from nltk.corpus import stopwords
stop = stopwords.words('english')
df2['name_stopwords'] = df2['name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [57]:
#stemming

porter_stemmer = PorterStemmer()
df2['name_stemming'] = df2['name_stopwords'].apply(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split() ]))

## Train and Test Split

In [58]:
data = df2[['label','name_stemming']].copy()
data.columns = ['label','name']
X_train,X_test,y_train,y_test = train_test_split(data['name'],data['label'],test_size=0.2,random_state = 34)

In [59]:
X_train_new       = X_train.copy()
y_train_new       = y_train.copy()

## Vectorization

In [60]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(list(X_train_new))
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Model Training

#### logistic regression

In [61]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
clf = LogisticRegression(max_iter=200,class_weight=dict(zip(np.unique(y_train), class_weights)))
clf.fit(X_train_tfidf, y_train_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight={'Emotion': 0.9315811965811965,
                                 'Financial': 1.1618697367018442,
                                 'Health': 0.7182063784923564,
                                 'Politics': 0.7095104804061971,
                                 'Science': 1.8389573139868398,
                                 'Sport': 1.388294484779009},
                   max_iter=200)

## Model Evaluation

In [62]:
y_pred = clf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("model accuracy is ", accuracy)

model accuracy is  0.8838532110091744


In [63]:
## Detailed Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Emotion       0.92      0.89      0.90      2941
   Financial       0.84      0.85      0.84      2365
      Health       0.91      0.87      0.89      3791
    Politics       0.92      0.89      0.90      3840
     Science       0.74      0.91      0.82      1477
       Sport       0.91      0.91      0.91      1936

    accuracy                           0.88     16350
   macro avg       0.87      0.89      0.88     16350
weighted avg       0.89      0.88      0.88     16350



In [64]:
multilabel_confusion_matrix(y_test, y_pred, labels=["Politics","Health","Emotion","Financial","Sport","Science"])

array([[[12207,   303],
        [  428,  3412]],

       [[12218,   341],
        [  474,  3317]],

       [[13178,   231],
        [  320,  2621]],

       [[13598,   387],
        [  361,  2004]],

       [[14239,   175],
        [  179,  1757]],

       [[14411,   462],
        [  137,  1340]]], dtype=int64)

### Testing of probable classes

In [65]:
y_pred_proba = clf.predict_proba(X_test_tfidf)
top3_indices = y_pred_proba.argsort(axis=1)[:,-3:]
top_classes = clf.classes_[top3_indices]
top3_probs = y_pred_proba[np.arange(y_pred_proba.shape[0])[:,None],top3_indices]


In [66]:
df2_test = df2.loc[X_test.index] 

In [67]:
for i in range(1,10):
    print(f"Statement:{df2_test.iloc[i]['name']}")
    print("Top 3 predicted classes:")
    for j in range(3):
        print(f"{top_classes[i][j]} - Probability: {top3_probs[i][j]}")
        print()

Statement:i looked her deeply in the eyes and expressed to her that i loved her so deeply and that what she perceived as anger was my frustration at feeling inadequate to take care of her
Top 3 predicted classes:
Politics - Probability: 0.0005878887552105808

Health - Probability: 0.005741466412437277

Emotion - Probability: 0.9923549344538236

Statement:there are over 14 million active voters in the statenevada secretary of state says 21 noncitizens could have voted for president in her state
Top 3 predicted classes:
Sport - Probability: 0.0012677182786973266

Financial - Probability: 0.015711154416931092

Politics - Probability: 0.9813437943899106

Statement:italy extends cut on fuel excise duties to aug 21st from aug 2nd
Top 3 predicted classes:
Sport - Probability: 0.05723622138795872

Science - Probability: 0.06521681179053203

Financial - Probability: 0.7894328043342379

Statement:i wept with my grandparents who prayed for me by phone that i would feel gods presence to which i re

In [71]:
df2_test.iloc[1]

label                                                       Emotion
name              i looked her deeply in the eyes and expressed ...
name_stopwords    looked deeply eyes expressed loved deeply perc...
name_stemming     look deepli eye express love deepli perceiv an...
Name: 15243, dtype: object