**CORONAVIRUS TWEETS - SENTIMENT ANALYSIS**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Only the tweet text and its sentiment label are needed from either CSV.
columns_to_keep = ['OriginalTweet', 'Sentiment']
dataset_train = pd.read_csv('Corona_NLP_train.csv', encoding = "ISO-8859-1")[columns_to_keep]
dataset_test = pd.read_csv('Corona_NLP_test.csv')[columns_to_keep]
# Stack both files into one frame with a fresh 0..n-1 index; the split into
# train/test partitions is redone later in the notebook.
frames = [dataset_train, dataset_test]
dataset = pd.concat(frames, ignore_index=True)

In [None]:
# Show the last rows of the combined frame (also confirms the index runs to the end).
dataset.tail()

Unnamed: 0,OriginalTweet,Sentiment
44950,Meanwhile In A Supermarket in Israel -- People...,Positive
44951,Did you panic buy a lot of non-perishable item...,Negative
44952,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
44953,Gov need to do somethings instead of biar je r...,Extremely Negative
44954,I and @ForestandPaper members are committed to...,Extremely Positive


In [None]:
# Count missing values per column.
dataset.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [None]:
# (rows, columns) of the combined frame.
dataset.shape

(44955, 2)

**Converting string values to integers for the target variable**

In [None]:
# List the distinct sentiment labels present before any relabelling.
dataset['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [None]:
# Number of tweets per sentiment label (class balance).
dataset.Sentiment.value_counts()

Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: Sentiment, dtype: int64

In [None]:
# Fold each "Extremely <polarity>" label into its base polarity so that only
# three classes remain.
for polarity in ('Positive', 'Negative'):
  dataset['Sentiment'] = dataset['Sentiment'].str.replace(f'Extremely {polarity}', polarity)
dataset['Sentiment'].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping, then overwrite the string labels with
# their integer codes. `le` is kept so codes can be mapped back with
# le.inverse_transform / le.classes_.
le = LabelEncoder()
le.fit(dataset['Sentiment'])
dataset['Sentiment'] = le.transform(dataset['Sentiment'])
dataset.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,advice Talk to your neighbours family to excha...,2
2,Coronavirus Australia: Woolworths to give elde...,2
3,My food stock is not the only one which is emp...,2
4,"Me, ready to go at supermarket during the #COV...",0


In [None]:
# Replace @mentions and URLs with a space so they do not end up as tokens.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the tweets unchanged.
dataset['OriginalTweet'] = dataset['OriginalTweet'].str.replace(r'(@\w*)', ' ', regex=True) # Removing tags
dataset['OriginalTweet'] = dataset['OriginalTweet'].str.replace(r'http\S+', ' ', regex=True) # Removing urls

In [None]:
# Inspect the first rows after mention/URL removal.
dataset.head()

Unnamed: 0,OriginalTweet,Sentiment
0,and and,1
1,advice Talk to your neighbours family to excha...,2
2,Coronavirus Australia: Woolworths to give elde...,2
3,My food stock is not the only one which is emp...,2
4,"Me, ready to go at supermarket during the #COV...",0


**CONVERTING INTO WORDS AND LEMMATIZATION**

In [None]:
import re
import nltk
# Download the NLTK corpora used by the preprocessing cell: the stop-word
# lists and the WordNet data needed by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Number of tweets that the preprocessing loop will iterate over.
len(dataset)

44955

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# corpus[k] holds the cleaned, lemmatized text of the k-th tweet in `dataset`.
corpus = []
lem = WordNetLemmatizer()
# Load the stop-word list once, as a set: the original evaluated
# stopwords.words('english') for every single token and scanned it as a list,
# which dominated the run time of this cell.
english_stopwords = set(stopwords.words('english'))
# Iterate over the column itself rather than dataset['OriginalTweet'][i], so the
# loop does not depend on the index being exactly 0..n-1.
for raw_tweet in dataset['OriginalTweet']:
  og_tweet = re.sub(r'[^a-zA-Z]', ' ', raw_tweet) # Keep letters only (a-z and A-Z)
  og_tweet = og_tweet.lower() # Lowercase, otherwise the model treats "He" and "he" as two separate words
  og_tweet = og_tweet.split()
  og_tweet = [lem.lemmatize(word) for word in og_tweet if word not in english_stopwords]
  og_tweet = ' '.join(og_tweet)
  corpus.append(og_tweet)

In [None]:
# Spot-check one cleaned tweet (position 4).
corpus[4]

'ready go supermarket covid outbreak paranoid food stock litteraly empty coronavirus serious thing please panic cause shortage coronavirusfrance restezchezvous stayathome confinement'

**TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the cleaned tweets, capping the vocabulary at 20000 terms.
tfidf = TfidfVectorizer(max_features = 20000)
tfidf_matrix = tfidf.fit_transform(corpus)
# NOTE(review): the dense copy is large (rows x up to 20000 float columns);
# consider keeping the sparse matrix if every downstream consumer accepts it.
x = tfidf_matrix.toarray()
# Target vector: the last column of `dataset`.
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size = 0.20,
    random_state = 1212,
)

**Training Model**

In [None]:
from sklearn.linear_model import LogisticRegression
# Default (L2-penalised) logistic regression; max_iter is raised to 1000.
# Alternative configuration kept for reference:
#   solver='saga', max_iter=1000, penalty='l1', C=1
lr = LogisticRegression(max_iter=1000).fit(xtrain, ytrain)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
ypred = lr.predict(xtest)
# Two-column view: predicted label on the left, true label on the right.
print(np.column_stack((ypred, ytest)))

[[0 0]
 [2 1]
 [2 2]
 ...
 [2 2]
 [0 0]
 [0 0]]


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
print(cm)
# Fraction of held-out tweets classified correctly.
accuracy_score(y_true=ytest, y_pred=ypred)

[[2787  194  437]
 [ 312 1053  310]
 [ 350  159 3389]]


0.8040262484706929