<a href="https://colab.research.google.com/github/aniruddh996/Roman-urdu-Sentiment-Analyzer/blob/master/roman_urdu_dataset_checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff

import re

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, LeaveOneOut, GridSearchCV, RepeatedStratifiedKFold



In [None]:
!pip install --quiet optuna
import optuna

### data information

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' package, older ones from 'punkt', so fetch both to keep
# the notebook runnable on a fresh kernel regardless of the installed version.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource, quiet=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# The file has no header row, so the three column names are supplied here.
# The third column is labelled 'nan' and is dropped a few cells further down.
df = pd.read_csv(
    'roman_urdu.csv',
    header=None,
    names=['comment', 'sentiment', 'nan'],
)
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20229 entries, 0 to 20228
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    20228 non-null  object
 1   sentiment  20229 non-null  object
 2   nan        7 non-null      object
dtypes: object(3)
memory usage: 474.2+ KB


(20229, 3)

### missing values

In [None]:
df.isnull().sum()


comment          1
sentiment        0
nan          20222
dtype: int64

In [None]:
df.drop(columns = ['nan'], inplace = True)
df.head()

Unnamed: 0,comment,sentiment
0,Sai kha ya her kisi kay bus ki bat nhi hai lak...,Positive
1,sahi bt h,Positive
2,"Kya bt hai,",Positive
3,Wah je wah,Positive
4,Are wha kaya bat hai,Positive


In [None]:
df = df.dropna()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20228 entries, 0 to 20228
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    20228 non-null  object
 1   sentiment  20228 non-null  object
dtypes: object(2)
memory usage: 474.1+ KB


In [None]:
df[df['sentiment'] == 'Neative'].index

Int64Index([13277], dtype='int64')

In [None]:
# Remove the row(s) whose label is the misspelt 'Neative', then count the
# remaining rows per sentiment class.
i = df.index[df['sentiment'] == 'Neative']
df.drop(index = i, inplace = True)

sentiment_size = df['sentiment'].value_counts().reset_index()
sentiment_size.columns = ['sentiment', 'size']
sentiment_size

Unnamed: 0,sentiment,size
0,Neutral,8928
1,Positive,6013
2,Negative,5286


### Sentiment visualization

In [None]:
px.bar(sentiment_size, x = 'sentiment', y = 'size', color = 'size', title = 'sentiment size in the data')

### dropping features from dataset

In [None]:
def drop_features(features, data):
  """Remove column(s) from ``data`` in place.

  features: a column label or a list of column labels to remove.
  data: the DataFrame to modify; it is mutated and nothing is returned.
  """
  data.drop(columns = features, inplace = True)

### Preprocessing data

The regular expression below removes @-mentions and every character that is not a letter, a digit, a space or a tab, replacing each match with a space. This step cleans the raw comments. Let's try it on a sample sentence:

In [None]:
a = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ","jao bhai! aaj exam hai....")
print(a)

jao bhai  aaj exam hai    


In [None]:
def text_process(text):
  """Normalise one comment for vectorisation.

  The value is converted to ``str`` and lower-cased; @-mentions and every
  character other than letters, digits, spaces and tabs are replaced by a
  space; the remaining whitespace-separated tokens are joined with single
  spaces.
  """
  lowered = str(text).lower()
  cleaned = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", lowered)
  tokens = cleaned.split()
  return " ".join(tokens)

df['processed_text'] = df["comment"].apply(text_process)

In [None]:
df.head()

Unnamed: 0,comment,sentiment,processed_text
0,Sai kha ya her kisi kay bus ki bat nhi hai lak...,Positive,sai kha ya her kisi kay bus ki bat nhi hai lak...
1,sahi bt h,Positive,sahi bt h
2,"Kya bt hai,",Positive,kya bt hai
3,Wah je wah,Positive,wah je wah
4,Are wha kaya bat hai,Positive,are wha kaya bat hai


### Word tokenization and Removing Stopwords

In [None]:
# Roman-Urdu stop words removed before vectorisation.
# Stored as a frozenset: the original list contained duplicates ('lye',
# 'waisay', 'rha') and was scanned linearly for every token; a set gives
# constant-time membership tests with the same `in` semantics.
stopwords = frozenset([
    'ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy',
    'woh', 'bhi', 'aur', 'wo', 'yeh', 'rha', 'hota', 'ho', 'ga', 'ka',
    'le', 'lye', 'kr', 'kar', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch',
    'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to', 'is', 'hi', 'jo',
    'kya', 'thi', 'se', 'pe', 'phr', 'wala', 'us', 'na', 'ny', 'hun',
    'raha', 'ja', 'rahay', 'abi', 'uski', 'ne', 'haan', 'acha', 'nai', 'sent',
    'photo', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono',
    'hoa', 'aese', 'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui',
    'karna', 'krna', 'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu',
    'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou',
])

In [None]:
def remove_stopwords(text):
  """Return ``text`` with every token found in ``stopwords`` removed.

  The text is split with NLTK's ``word_tokenize`` and the surviving tokens
  are joined back together with single spaces.
  """
  kept = []
  for token in word_tokenize(text):
    if token not in stopwords:
      kept.append(token)
  return " ".join(kept)


In [None]:
df['removed_stopwords'] = df['processed_text'].apply(remove_stopwords)
#dropping the processed text
drop_features(['processed_text'],df)
df.head()

Unnamed: 0,comment,sentiment,removed_stopwords
0,Sai kha ya her kisi kay bus ki bat nhi hai lak...,Positive,sai kha her kisi kay bus bat nhi lakin hal kal...
1,sahi bt h,Positive,sahi bt h
2,"Kya bt hai,",Positive,bt
3,Wah je wah,Positive,wah je wah
4,Are wha kaya bat hai,Positive,are wha kaya bat


### deleting the comment column 

In [None]:
drop_features(['comment'], df)

In [None]:
column_names = ['removed_stopwords', 'sentiment']
df = df.reindex(columns = column_names)

In [None]:
df.head()

Unnamed: 0,removed_stopwords,sentiment
0,sai kha her kisi kay bus bat nhi lakin hal kal...,Positive
1,sahi bt h,Positive
2,bt,Positive
3,wah je wah,Positive
4,are wha kaya bat,Positive


### label encoding target data

In [None]:
# Encode the string sentiment labels as integer class codes.
# LabelEncoder sorts the class names, so the codes follow their alphabetical order.
LE_y = LabelEncoder()
y = LE_y.fit_transform(df['sentiment'].values)

Now we have both the features and the target values ready for training the model. The label encoder converts the categorical data into numeric data. `LabelEncoder` assigns codes in sorted (alphabetical) order of the class names, so Negative, Neutral and Positive are encoded as 0, 1 and 2 respectively.

### splitting the model into training and testing set

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df['removed_stopwords'], y, test_size = 0.20, random_state = 0)

### vectorizing both training and testing feature sets


The count vectorizer converts the string data into a sparse matrix that holds the number of occurrences of each word. The TF-IDF transformer then reweights those counts by term frequency and inverse document frequency, giving more weight to words that appear in fewer documents. This representation is less biased towards very common words and is what the models below are trained on.

In [None]:
%%time 
count_vec = CountVectorizer()
tfidf_transform = TfidfTransformer(norm = 'l2', sublinear_tf = True)

# vectorizing and transforming training set 
x_train_count = count_vec.fit_transform(x_train)

# tfidf transformer on count vectorizer
x_train_tfidf = tfidf_transform.fit_transform(x_train_count) 

# vectorizing and transforming test set
x_test_count = count_vec.transform(x_test)
x_test_tfidf = tfidf_transform.transform(x_test_count)

CPU times: user 298 ms, sys: 7.62 ms, total: 306 ms
Wall time: 309 ms


### Training and Prediction



classification models: 

1) Random Forest

2) Logistic Regression

3) SVC

4) passive aggressive classifier 




### Random Forest Score

In [None]:
%%time
rlf_clf = RandomForestClassifier(n_estimators=150, max_depth = 300,random_state = 0)
rlf_clf.fit(x_train_tfidf, y_train)
rf_prediction = rlf_clf.predict(x_test_tfidf)

CPU times: user 41.8 s, sys: 198 ms, total: 42 s
Wall time: 42 s


In [None]:
accuracy_score(y_test, rf_prediction)

0.6322293623331685

### Logistic Regression Score

In [None]:
%%time
logistic_regression_clf = LogisticRegression(random_state = 0, solver = 'liblinear')
logistic_regression_clf.fit(x_train_tfidf, y_train)
log_predict = logistic_regression_clf.predict(x_test_tfidf)


CPU times: user 272 ms, sys: 179 ms, total: 451 ms
Wall time: 236 ms


In [None]:
accuracy_score(y_test, log_predict)

0.6537320810677212

### SVC Score

In [None]:
%%time
svc_clf = svm.SVC(gamma = 'scale', random_state = 0)
svc_clf.fit(x_train_tfidf, y_train)
svc_predict = svc_clf.predict(x_test_tfidf)

CPU times: user 1min 26s, sys: 671 ms, total: 1min 26s
Wall time: 1min 26s


In [None]:
accuracy_score(y_test, svc_predict)

0.6611468116658428

In [None]:
svc_predict

array([1, 0, 1, ..., 1, 1, 1])

### Passive Aggressive Classifier

In [None]:
%%time
pac_clf = PassiveAggressiveClassifier(random_state = 0)
pac_clf.fit(x_train_tfidf, y_train)
pac_predict = pac_clf.predict(x_test_tfidf)

CPU times: user 308 ms, sys: 213 ms, total: 521 ms
Wall time: 275 ms


In [None]:
accuracy_score(y_test, pac_predict)

0.5996045477014336

### Performance Evaluation - 1


The performance scores for the four models show that the support vector classifier scores higher than the other algorithms. However, the SVC cell also takes much longer to run than the others. We can also see that the best-scoring algorithms differ by only about 1%, so there isn't much difference in performance.

However, it is important to tune the parameters of the models for a better evaluation, as these scores are too low to be relied on as they stand. Here, we will be using Optuna, a hyperparameter optimization framework.


#### Parameter tuning and cross validation for Logistic Regression and Random Forest classifier

In [None]:
import sklearn.svm  # binds the top-level `sklearn` name used by the qualified references below

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy on the training set.

    Each trial first picks a model family and then its hyperparameters:
    a random forest (`n_estimators`, `max_depth`) or a logistic regression
    (`solver`). Estimators are seeded so that a given set of parameters
    always yields the same score.

    trial: the Optuna trial used to sample the parameters.
    Returns the mean of the three fold scores from `cross_val_score`.
    """
    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'LogisticRegression'])

    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 2, 200)
        # Sampled on a log scale, then truncated to the integer depth the forest expects.
        max_depth = int(trial.suggest_loguniform('max_depth', 1, 300))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=0)
    else:
        solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
        clf = LogisticRegression(solver=solver, random_state=0)

    return sklearn.model_selection.cross_val_score(
        clf, x_train_tfidf, y_train, n_jobs=-1, cv=3).mean()

# Seed the sampler so that "Restart & Run All" explores the same trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=200)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-06-28 05:24:39,960] Finished trial#0 with value: 0.5979852463046493 with parameters: {'classifier': 'RandomForest', 'n_estimators': 116, 'max_depth': 134.93531515423533}. Best is trial#0 with value: 0.5979852463046493.
[I 2020-06-28 05:24:40,488] Finished trial#1 with value: 0.6200485035291702 with parameters: {'classifier': 'LogisticRegression', 'solver': 'liblinear'}. Best is trial#1 with value: 0.6200485035291702.
[I 2020-06-28 05:24:41,275] Finished trial#2 with value: 0.6265991979834976 with parameters: {'classifier': 'LogisticRegression', 'solver': 'saga'}. Best is trial#2 with value: 0.6265991979834976.
[I 2020-06-28 05:25:14,832] Finished trial#3 with value: 0.5608427849144042 with parameters: {'classifier': 'RandomForest', 'n_estimators': 121, 'max_depth': 58.4954666593498}. Best is trial#2 with value: 0.6265991979834976.
[I 2020-06-28 05:25:15,581] Finished trial#4 with value: 0.5071991224061421 with parameters: {'classifier': 'RandomForest', 'n_estimators': 2, 'max_d

Accuracy: 0.6267228035591713
Best hyperparameters: {'classifier': 'LogisticRegression', 'solver': 'saga'}


In [None]:
def objective(trial):
    """Optuna objective for the passive-aggressive classifier.

    Samples `n_iter_no_change` and returns the mean 3-fold cross-validated
    accuracy on the training set. The test split is deliberately not used
    here: selecting a hyperparameter on the test set would make the final
    test accuracy optimistically biased.

    trial: the Optuna trial used to sample the parameter.
    """
    n_iter_no_change = trial.suggest_int('n_iter_no_change', 1, 50)
    clf = PassiveAggressiveClassifier(
        # max_iter=100 stopped before convergence in the recorded run.
        max_iter=1000,
        fit_intercept=True,
        n_iter_no_change=n_iter_no_change,
        random_state=0,  # fixed seed: a given parameter value always scores the same
    )
    return cross_val_score(clf, x_train_tfidf, y_train, n_jobs=-1, cv=3).mean()

# Seed the sampler so that "Restart & Run All" explores the same trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-06-28 06:18:19,846] Finished trial#0 with value: 0.5934256055363322 with parameters: {'n_iter_no_change': 18}. Best is trial#0 with value: 0.5934256055363322.
[I 2020-06-28 06:18:20,436] Finished trial#1 with value: 0.5954028670291646 with parameters: {'n_iter_no_change': 33}. Best is trial#1 with value: 0.5954028670291646.

Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.

[I 2020-06-28 06:18:21,078] Finished trial#2 with value: 0.5941670785961444 with parameters: {'n_iter_no_change': 48}. Best is trial#1 with value: 0.5954028670291646.
[I 2020-06-28 06:18:21,588] Finished trial#3 with value: 0.5946613939693525 with parameters: {'n_iter_no_change': 16}. Best is trial#1 with value: 0.5954028670291646.
[I 2020-06-28 06:18:22,247] Finished trial#4 with value: 0.5909540286702917 with parameters: {'n_iter_no_change': 46}. Best is trial#1 with value: 0.5954028670291646.
[I 2020-06-28 06:18:22,779] Finished trial#5 with value: 0

Accuracy: 0.6080079090459714
Best hyperparameters: {'n_iter_no_change': 1}


Unfortunately, the SVC could not be fine-tuned any further because of its long execution time. The code below is my attempt at tuning the SVC parameters (that one cell took around 2-3 hours to execute).

In [None]:
def objective(trial):
    """Optuna objective for the support vector classifier.

    Samples the regularisation strength `C` (log scale) and the `gamma`
    mode, builds an SVC with exactly those values, and returns its mean
    3-fold cross-validated accuracy on the training set.

    trial: the Optuna trial used to sample the parameters.
    """
    # Search range kept to six decades around the default C=1; far more
    # extreme values are expected to be impractically slow to fit or to
    # underfit badly, so sampling them would mostly waste trials.
    c = trial.suggest_loguniform('svc_c', 1e-3, 1e3)
    gamma = trial.suggest_categorical('gamma', ['auto', 'scale'])

    # The sampled values must reach the estimator that is scored; re-creating
    # a default SVC() here would make every trial evaluate the same model.
    clf = svm.SVC(C=c, gamma=gamma, random_state=0)

    # Score on cross-validation folds of the training data so that the test
    # split stays untouched for the final evaluation.
    return cross_val_score(clf, x_train_tfidf, y_train, n_jobs=-1, cv=3).mean()

# Seed the sampler so that "Restart & Run All" explores the same trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-06-28 06:58:07,340] Finished trial#0 with value: 0.6611468116658428 with parameters: {'svc_c': 10429215.975263635, 'gamma': 'scale'}. Best is trial#0 with value: 0.6611468116658428.
[I 2020-06-28 06:59:35,272] Finished trial#1 with value: 0.6611468116658428 with parameters: {'svc_c': 1.5098581009325772e-07, 'gamma': 'auto'}. Best is trial#0 with value: 0.6611468116658428.


### Performance scores after hyperparameter tuning

### Random forest 

In [None]:
%%time
rlf_clf = RandomForestClassifier(random_state = 0)
rlf_clf.fit(x_train_tfidf, y_train)
rf_prediction = rlf_clf.predict(x_test_tfidf)


CPU times: user 41.7 s, sys: 17.6 ms, total: 41.7 s
Wall time: 41.7 s


In [None]:
accuracy_score(y_test, rf_prediction)

0.6408798813643104

### Logistic Regression

In [None]:
%%time
logistic_regression_clf = LogisticRegression(random_state = 0, max_iter = 1000, solver = 'liblinear')
logistic_regression_clf.fit(x_train_tfidf, y_train)
log_predict = logistic_regression_clf.predict(x_test_tfidf)


CPU times: user 278 ms, sys: 186 ms, total: 463 ms
Wall time: 252 ms


In [None]:
accuracy_score(y_test, log_predict)

0.6537320810677212

### SVC score

In [None]:
%%time
svc_clf = svm.SVC(gamma = 'scale', random_state = 0, C = 10429215.975263635)
svc_clf.fit(x_train_tfidf, y_train)
svc_predict = svc_clf.predict(x_test_tfidf)



CPU times: user 2min 19s, sys: 80 ms, total: 2min 19s
Wall time: 2min 19s


In [None]:
accuracy_score(y_test, svc_predict)

0.6727632229362334

### Passive Aggressive Classifier score

In [None]:
%%time
pac_clf = PassiveAggressiveClassifier(random_state = 0, fit_intercept=True, n_iter_no_change= 1)
pac_clf.fit(x_train_tfidf, y_train)
pac_predict = pac_clf.predict(x_test_tfidf)


CPU times: user 195 ms, sys: 123 ms, total: 318 ms
Wall time: 198 ms


In [None]:
accuracy_score(y_test, pac_predict)

0.606030647553139

### Performance Evaluation -2

From the parameter-tuned classifiers above, we can conclude that there isn't much of a difference in accuracy. All of the classifiers kept their scores at around 60-70% before and after tuning, and some did just as well without any parameter changes. It is still worth comparing Logistic Regression and the support vector classifier in terms of the relationship between their actual and predicted values, as the two are close in accuracy but differ hugely in speed.

### Relationship between the actual and predicted target values

In [None]:
# First 200 test rows of actual vs. predicted class codes:
# df1 holds the SVC predictions, df2 the logistic-regression predictions.
df1 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': svc_predict.flatten()}).head(200)
df2 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': log_predict.flatten()}).head(200)

In [None]:
px.bar(df1, x = 'Actual', y= 'Predicted', color = 'Predicted', title = 'SVC actual and predicted value relationship')

In [None]:
px.bar(df2, x = 'Actual', y= 'Predicted', color = 'Predicted', title = 'Logistic Regression actual and predicted value relationship')

In [None]:
# Confusion-matrix rows/columns are ordered by encoded class (0, 1, 2), so the
# axis labels are taken from the fitted encoder instead of being hard-coded.
# Separate names are used so that the encoded target `y` is not overwritten.
class_names = [str(name).lower() for name in LE_y.classes_]
z = confusion_matrix(y_test, log_predict, labels=list(range(len(class_names))))

# change each element of z to type string for annotations
z_text = [[str(count) for count in row] for row in z]

# figure
fig = ff.create_annotated_heatmap(z, x=class_names, y=class_names,
                                  annotation_text=z_text, colorscale='Viridis')

# title
fig.update_layout(title_text='<i><b>Confusion Matrix for Logistic Regression</b></i>')

# add custom xaxis title (columns of the matrix are the predictions)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

# add custom yaxis title (rows of the matrix are the true labels)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.35,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

# adjust margins to make room for yaxis title
fig.update_layout(margin=dict(t=50, l=200))

# add colorbar
fig['data'][0]['showscale'] = True
fig.show()

In [None]:
# Confusion-matrix rows/columns are ordered by encoded class (0, 1, 2), so the
# axis labels are taken from the fitted encoder instead of being hard-coded.
# Separate names are used so that the encoded target `y` is not overwritten.
class_names = [str(name).lower() for name in LE_y.classes_]
z = confusion_matrix(y_test, svc_predict, labels=list(range(len(class_names))))

# change each element of z to type string for annotations
z_text = [[str(count) for count in row] for row in z]

# figure
fig = ff.create_annotated_heatmap(z, x=class_names, y=class_names,
                                  annotation_text=z_text, colorscale='Viridis')

# title
fig.update_layout(title_text='<i><b>Confusion Matrix for SVC</b></i>')

# add custom xaxis title (columns of the matrix are the predictions)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

# add custom yaxis title (rows of the matrix are the true labels)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.35,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

# adjust margins to make room for yaxis title
fig.update_layout(margin=dict(t=50, l=200))

# add colorbar
fig['data'][0]['showscale'] = True
fig.show()

### Finalization

By comparing the two confusion matrices, it's safe to say that the two models do not differ much in how their predicted values relate to the actual values. Therefore, the model that is the best fit for this scenario is the Support Vector Classifier.