In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from sklearn.svm import SVC 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [28]:
import numpy as np
import pandas as pd
import cleaning
import augmentation
import back_translation

Read Data

In [4]:
df_read = pd.read_csv("assignment.csv")
# df.isnull().sum().sum()
df_read = df_read.dropna() 
# df['label'].unique()

In [5]:
%load_ext autoreload

In [6]:
%autoreload 2

In [7]:
df = cleaning.clean_data(df_read)
df.head(5)

Unnamed: 0,content,label,name
0,im struggling to understand how i feel about s...,Emotion,im struggling to understand how i feel about s...
1,new modis promise to ban plastic straws this m...,Financial,new modis promise to ban plastic straws this m...
2,survey middleincome americans spending less am...,Financial,survey middleincome americans spending less am...
3,watch wall street ended lower after bank stock...,Financial,watch wall street ended lower after bank stock...
4,someone selling a ton of se puts to open sep a...,Financial,someone selling a ton of se puts to open sep a...


In [8]:
df = df.drop('content',axis=1)

## Back translation

#### English to French to English

In [9]:
obj1 = back_translation.back_tr(df,'Sport').main(50)



In [10]:
obj2 = back_translation.back_tr(df,'Science').main(100)

In [14]:
df_bt       = pd.concat([df,obj1,obj2],ignore_index=True)

In [15]:
df2 = df_bt.copy()

### synonym replacement

In [16]:
#for Science
science_augmentation = augmentation.synonim_aug(df2,multiples=2) 
science_augmentation['label'] = len(science_augmentation)*['Science']
df2 = pd.concat([df2,science_augmentation])

#for Sport
science_augmentation = augmentation.synonim_aug(df2,class_="Sport",multiples=2) 
science_augmentation['label'] = len(science_augmentation)*['Sport']
df2 = pd.concat([df2,science_augmentation])

#stopword removal
from nltk.corpus import stopwords
stop = stopwords.words('english')
df2['name_stopwords'] = df2['name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [17]:
#stemming
 
# Create a Porter Stemmer instance
porter_stemmer = PorterStemmer()
df2['name_stemming'] = df2['name_stopwords'].apply(lambda x: ' '.join([porter_stemmer.stem(word) for word in x.split() ]))

## Train and Test Split

In [18]:
data = df2[['label','name_stemming']].copy()
data.columns = ['label','name']
X_train,X_test,y_train,y_test = train_test_split(data['name'],data['label'],test_size=0.1,random_state = 34)

In [19]:
X_train_new       = X_train.copy()
y_train_new       = y_train.copy()

## Vectorization

In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)

X_train_tfidf = tfidf_vectorizer.fit_transform(list(X_train_new))
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Model Training

#### SVC

In [21]:
n_estimators = 10
clf = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'), max_samples=1.0 / n_estimators, n_estimators=n_estimators))

In [22]:
clf.fit(X_train_tfidf, y_train_new)

OneVsRestClassifier(estimator=BaggingClassifier(base_estimator=SVC(class_weight='balanced',
                                                                   kernel='linear',
                                                                   probability=True),
                                                max_samples=0.1))

## Model Evaluation

In [23]:
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("model accuracy is ", accuracy)

model accuracy is  0.8741284403669725


In [24]:
## Detailed Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Emotion       0.90      0.89      0.89      1462
   Financial       0.86      0.81      0.83      1209
      Health       0.86      0.90      0.88      1885
    Politics       0.89      0.92      0.91      1929
     Science       0.81      0.76      0.78       744
       Sport       0.91      0.88      0.89       946

    accuracy                           0.87      8175
   macro avg       0.87      0.86      0.86      8175
weighted avg       0.87      0.87      0.87      8175



In [25]:
multilabel_confusion_matrix(y_test, y_pred, labels=["Politics","Health","Emotion","Financial","Sport","Science"])

array([[[6018,  228],
        [ 146, 1783]],

       [[6008,  282],
        [ 186, 1699]],

       [[6571,  142],
        [ 168, 1294]],

       [[6807,  159],
        [ 234,  975]],

       [[7142,   87],
        [ 116,  830]],

       [[7300,  131],
        [ 179,  565]]], dtype=int64)

### Testing of probable classes

In [29]:
y_pred_proba = clf.predict_proba(X_test_tfidf)
top3_indices = y_pred_proba.argsort(axis=1)[:,-3:]
top_classes = clf.classes_[top3_indices]
top3_probs = y_pred_proba[np.arange(y_pred_proba.shape[0])[:,None],top3_indices]


In [30]:
df2_test = df2.loc[X_test.index] 

In [31]:
for i in range(1,10):
    print(f"Statement:{df2_test.iloc[i]['name']}")
    print("Top 3 predicted classes:")
    for j in range(3):
        print(f"{top_classes[i][j]} - Probability: {top3_probs[i][j]}")
        print()

Statement:i looked her deeply in the eyes and expressed to her that i loved her so deeply and that what she perceived as anger was my frustration at feeling inadequate to take care of her
Top 3 predicted classes:
Politics - Probability: 0.0055398076513995115

Health - Probability: 0.03166045648595411

Emotion - Probability: 0.9540365732426701

Statement:there are over 14 million active voters in the statenevada secretary of state says 21 noncitizens could have voted for president in her state
Top 3 predicted classes:
Health - Probability: 0.003761868000795956

Financial - Probability: 0.08586834594149262

Politics - Probability: 0.905642990408064

Statement:italy extends cut on fuel excise duties to aug 21st from aug 2nd
Top 3 predicted classes:
Politics - Probability: 0.0837696385115151

Science - Probability: 0.10423389842426858

Financial - Probability: 0.6777447240136051

Statement:i wept with my grandparents who prayed for me by phone that i would feel gods presence to which i rep

In [36]:
df2_test.iloc[1]

label                                                       Emotion
name              i looked her deeply in the eyes and expressed ...
name_stopwords    looked deeply eyes expressed loved deeply perc...
name_stemming     look deepli eye express love deepli perceiv an...
Name: 15243, dtype: object