# Installing libraries

In [None]:
!pip install pandas
!pip install numpy
!pip install nltk
!pip install textblob
!pip install pickle
!pip install re


# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import textblob
import pickle
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Dataset

In [40]:
# Load the raw tweet/emotion CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; it has to be
# edited before the notebook can run on any other machine.
data = pd.read_csv(r'C:\Users\Noor AB\Desktop\Senior\nlp\Datasetes\text_emotion.csv')

#data = data.drop('author', axis=1)

# Dropping rows with other emotion labels.
# All unwanted labels are removed in one pass instead of one full-frame drop per
# label; the surviving rows and their index labels are the same as before.
# Presumably this leaves only 'happiness' and 'sadness' (see the label-encoding
# cell) -- verify with data.sentiment.unique().
unwanted_sentiments = [
    'anger', 'boredom', 'enthusiasm', 'empty', 'fun', 'relief',
    'surprise', 'love', 'hate', 'neutral', 'worry',
]
data = data.drop(data.index[data.sentiment.isin(unwanted_sentiments)])


# Preprocessing Data

In [41]:
# Making all letters lowercase (the split/join also collapses runs of whitespace)
data['content'] = data['content'].apply(lambda text: " ".join(token.lower() for token in text.split()))

# Removing Punctuation, Symbols
# regex=True must be passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string avoids invalid-escape warnings for \w and \s.
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Removing Stop Words using NLTK
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list.
stop_lookup = set(stop)
data['content'] = data['content'].apply(lambda text: " ".join(token for token in text.split() if token not in stop_lookup))

#Lemmatisation
data['content'] = data['content'].apply(lambda text: " ".join([Word(token).lemmatize() for token in text.split()]))
#Correcting Letter Repetitions

def de_repeat(text):
    """Shorten stretched-out characters in ``text``.

    Any run of three or more identical characters is cut down to exactly two
    (``"sooooo"`` -> ``"soo"``); runs of one or two characters are left alone.
    Returns the rewritten string.
    """
    return re.sub(r"(.)\1{2,}", lambda run: run.group(1) * 2, text)

# Apply de_repeat to every whitespace-separated token of each row
data['content'] = data['content'].apply(lambda text: " ".join(de_repeat(token) for token in text.split()))

# Code to find the top 10,000 rarest words appearing in the data
# (value_counts sorts by descending frequency, so the tail holds the rarest words)
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
# Membership is tested once per token; a set makes each test constant-time
# instead of a scan over the 10,000-element list. `freq` itself stays a list.
rare_words = set(freq)
data['content'] = data['content'].apply(lambda text: " ".join(token for token in text.split() if token not in rare_words))

  data['content'] = data['content'].str.replace('[^\w\s]',' ')


# Saving preprocessed Dataset

In [42]:
# Write the preprocessed frame to a CSV file (the row index is written as well,
# since index= is left at its default).
# DataFrame.to_csv returns None when it is given a path, so `newdata` is None.
preprocessed_csv_path = r'C:\Users\Noor AB\Desktop\Senior\nlp\Datasetes\newdataset.csv'
newdata = data.to_csv(preprocessed_csv_path)

# Encoding labels and Splitting data

In [44]:
# Encode the sentiment strings as integers. LabelEncoder numbers classes in
# sorted order, so with only 'happiness' and 'sadness' left in the data this
# gives 'happiness' -> 0 and 'sadness' -> 1.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data['sentiment'].values)

# Shuffled, stratified 80:20 train/test split (test_size=0.2) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data['content'].values,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)


# Extracting Count Vectors And Building Trained Model

In [45]:
# Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer='word')
# Learn the vocabulary from the training split only, so that nothing about the
# held-out test texts feeds into feature extraction. Test-only words are simply
# ignored by transform().
X_train_count = count_vect.fit_transform(X_train)
X_test_count = count_vect.transform(X_test)

## Building models using count vectors feature
# Model 1: Linear SVM (SGDClassifier with its default hinge loss).
# tol=None disables early stopping, so training runs for the full max_iter epochs.
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=1000, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_test_count)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the
# printed number does not depend on the order, but this matches the API.
print('lsvm using count vectors accuracy %s' % accuracy_score(y_test, y_pred))
# Accuracy noted from an earlier run: 0.7928709055876686
# NOTE(review): the saved cell output shows a different value -- re-run to confirm.

# Model 2: Logistic Regression (C=1 is the inverse regularisation strength)
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_test_count)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the
# printed number does not depend on the order, but this matches the API.
print('log reg count vectors accuracy %s' % accuracy_score(y_test, y_pred))
# Accuracy noted from an earlier run: 0.7851637764932563
# NOTE(review): the saved cell output shows a different value -- re-run to confirm.

naive bayes count vectors accuracy 0.7749397590361445
lsvm using count vectors accuracy 0.7874698795180722
log reg count vectors accuracy 0.7975903614457831
random forest with count vectors accuracy 0.7759036144578313


# Data For Testing The Model

In [49]:
# Eight hand-written statements used as a quick sanity check of the trained models.
# Label encoding: happy -> 0, sad -> 1.
# The happy and sad statements are interleaved; by their wording they read, in
# order: happy, sad, happy, happy, sad, sad, happy, sad.
sample_statements = [
    'I am very happy today! The atmosphere looks cheerful',
    'Things are looking bad. It was such a hard day',
    'Success is right around the corner. Lets celebrate this victory',
    'Everything is more beautiful when you experience them with a smile!',
    'Now this is my worst',
    'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
    'This is quite happy',
    'His death broke . It was a sad day',
]
# Single-column frame; the text lives in column 0
tweets = pd.DataFrame(sample_statements)

# Preprocess the sample tweets with the same steps, in the same order, as the
# training text: lowercase, strip punctuation, drop stop words, lemmatise,
# collapse repeated letters. (stopwords and Word come from the import cell at the
# top of the notebook, so they are not re-imported here.)

# Lowercase first: the NLTK stop-word list is lowercase, so capitalised words
# such as "The" or "It" would otherwise slip through the stop-word filter.
tweets[0] = tweets[0].str.lower()

# regex=True must be passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave punctuation in place.
tweets[0] = tweets[0].str.replace(r'[^\w\s]', ' ', regex=True)

stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list.
stop_lookup = set(stop)
tweets[0] = tweets[0].apply(lambda text: " ".join(token for token in text.split() if token not in stop_lookup))

tweets[0] = tweets[0].apply(lambda text: " ".join([Word(token).lemmatize() for token in text.split()]))

# Same letter-repetition clean-up that was applied to the training text
tweets[0] = tweets[0].apply(lambda text: " ".join(de_repeat(token) for token in text.split()))

# Turn the preprocessed tweets into count vectors using the already-fitted vectorizer
tweet_count = count_vect.transform(tweets[0])

# Predicting the emotion of each tweet with the trained logistic regression model
# (happy -> 0, sad -> 1)
tweet_pred = logreg.predict(tweet_count)
print(tweet_pred)


[0 1 0 0 1 1 0 1]


  tweets[0] = tweets[0].str.replace('[^\w\s]',' ')


# Saving Model

In [56]:
# Save the logistic regression predictor to disk, then read it straight back.
# What gets pickled is the bound method `logreg.predict` (the estimator is
# serialised along with it), so the reloaded `logregmodel` is called directly.
# NOTE(review): the fitted count_vect is not saved here; anything that loads this
# file elsewhere also needs the same vectorizer to build its inputs.
# The `with` blocks close both file handles even if dump/load raises.
with open("LogisticRegression.pickle", "wb") as pickle_out:
    pickle.dump(logreg.predict, pickle_out)

# pickle.load can execute arbitrary code: only load pickle files you created yourself.
with open("LogisticRegression.pickle", "rb") as pickle_in:
    logregmodel = pickle.load(pickle_in)

In [57]:
# Sanity check: call the predictor reloaded from LogisticRegression.pickle on the tweet count vectors
logregmodel(tweet_count)

array([0, 1, 0, 0, 1, 1, 0, 1])

In [59]:
# Save the linear SVM predictor to disk, then read it straight back.
# What gets pickled is the bound method `lsvm.predict` (the estimator is
# serialised along with it), so the reloaded `lsvmmodel` is called directly.
# NOTE(review): the fitted count_vect is not saved here; anything that loads this
# file elsewhere also needs the same vectorizer to build its inputs.
# The `with` blocks close both file handles even if dump/load raises.
with open("LinearSVM.pickle", "wb") as pickle_out:
    pickle.dump(lsvm.predict, pickle_out)

# pickle.load can execute arbitrary code: only load pickle files you created yourself.
with open("LinearSVM.pickle", "rb") as pickle_in:
    lsvmmodel = pickle.load(pickle_in)

In [60]:
# Sanity check: call the predictor reloaded from LinearSVM.pickle on the tweet count vectors
lsvmmodel(tweet_count)

array([0, 1, 0, 0, 1, 1, 0, 1])