___

**Importing necessary libraries**

In [None]:
import html
import re
import warnings

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Fetch the NLTK stop-word corpus into the local nltk_data directory.
nltk.download(info_or_id='stopwords')
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Importing data**

In [None]:
# Read both splits from CSV files located in the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
print(train.shape, test.shape)

(44100, 4) (18900, 3)


In [None]:
# Peek at the first five training rows.
train.head(n=5)

**Text Preprocessing**

In [None]:
def clean_text(text):
  """Normalise one raw review before tokenisation.

  Steps, in order: lower-case, strip HTML tags, decode HTML entities,
  drop URLs, drop @mentions, drop round and square brackets.

  Args:
    text: The raw review. ``None`` and float NaN (how pandas represents an
      empty CSV cell) are treated as an empty review; any other non-string
      value is converted with ``str()``.

  Returns:
    The cleaned, lower-cased string. Whitespace left behind by removed
    spans is not collapsed here.
  """
  # `text != text` is only true for NaN; guard it so `.lower()` cannot fail
  # on missing values.
  if text is None or (isinstance(text, float) and text != text):
    return ''

  text = str(text).lower()                                   # convert text to lower
  text = re.sub(r'<.*?>', '', text)                          # remove html tags
  # Decode entities only AFTER tag stripping, so that an escaped '<' / '>'
  # in ordinary text (e.g. "&lt;3") cannot be mistaken for a tag. Without
  # this step "&quot;" would survive as the junk token "quot".
  text = html.unescape(text)                                 # decode html entities
  text = re.sub(r'(https?:\/\/|www\.)\S+', '', text)         # remove urls
  text = re.sub(r'@\S+', '', text)                           # remove @mentions
  text = re.sub(r'[\[\]\(\)]', '', text)                     # remove brackets
  return text

In [None]:
# Built once and reused across calls: this function runs once per row via
# `.apply`, so constructing a new tokenizer on every call is wasted work.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def text_preprocessing(text):
  """Clean a raw review and return it as space-separated word tokens.

  The text is first passed through ``clean_text`` and then split into
  ``\\w+`` tokens, which drops punctuation. Stop words are kept: a filtering
  step existed here but was commented out.

  Args:
    text: The raw review, in whatever form ``clean_text`` accepts.

  Returns:
    A single string with the tokens joined by one space.
  """
  tokens = _WORD_TOKENIZER.tokenize(clean_text(text))
  return ' '.join(tokens)

In [None]:
# Run the same cleaning/tokenising pipeline over the review text of both splits.
for frame in (train, test):
  frame['Review'] = frame['Review'].apply(text_preprocessing)
train.head()

Unnamed: 0,ID,author,Review,Sentiment
0,39467,rayinstirling,today i m working on my quot quirky q quot cue...,2
1,30154,DirtyRose17,dont ya know people love the human society,1
2,16767,yoliemichelle,ughhh rejected from the 09 mediation program s...,0
3,9334,jayamelwani,im so jealous i want an octo drive,0
4,61178,aliisanoun,i remember all the hype around this movie when...,0


**Bag-of-words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vectorizer, tfidf_vectorizer = CountVectorizer(), TfidfVectorizer()

# Each vectorizer is fitted on the training reviews only; the test reviews
# are merely transformed with the already-fitted vectorizer.
train_cv, test_cv = (
    count_vectorizer.fit_transform(train['Review']),
    count_vectorizer.transform(test['Review']),
)

train_tfidf, test_tfidf = (
    tfidf_vectorizer.fit_transform(train['Review']),
    tfidf_vectorizer.transform(test['Review']),
)

**Training model**

In [None]:
# Count-Vectorized data
# Count-vectorized features: 5-fold cross-validation scored by negative
# log loss (values closer to 0 are better).
clf1 = LogisticRegression(max_iter=500)
scores = model_selection.cross_val_score(
    estimator=clf1,
    X=train_cv,
    y=train["Sentiment"],
    scoring="neg_log_loss",
    cv=5,
)
scores

array([-0.8827589 , -0.85888143, -0.86283002, -0.88487643, -0.89656309])

In [None]:
# Tfidf-Vectorized data
# TF-IDF features: 5-fold cross-validation scored by negative log loss
# (values closer to 0 are better).
clf2 = LogisticRegression(max_iter=500)
scores = model_selection.cross_val_score(
    estimator=clf2,
    X=train_tfidf,
    y=train["Sentiment"],
    scoring="neg_log_loss",
    cv=5,
)
scores

array([-0.71870535, -0.70465121, -0.70489987, -0.72135558, -0.71414712])

In [None]:
# The TF-IDF features scored better in cross-validation, so clf2 is refitted
# on the complete training set before it is used for inference.
clf2.fit(X=train_tfidf, y=train["Sentiment"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

**Creating submission file**

In [None]:
# Class probabilities for every test review, one column per class.
preds = clf2.predict_proba(test_tfidf)

# Positional column index -> submission header.
# NOTE(review): assumes the labels are exactly 0/1/2 in that order - confirm.
column_names = {0: 'Negative_0', 1: 'Neutral_1', 2: 'Positive_2'}
df2 = pd.DataFrame(preds)
df2 = df2.rename(columns=column_names)

# Write without the index, then read the file back to display what was saved.
df2.to_csv('my_submission_file.csv', index=False)
pd.read_csv('my_submission_file.csv')

Unnamed: 0,Negative_0,Neutral_1,Positive_2
0,0.179506,0.091303,0.729191
1,0.659003,0.166853,0.174144
2,0.969375,0.000110,0.030515
3,0.529993,0.101162,0.368845
4,0.338943,0.344546,0.316511
...,...,...,...
18895,0.423907,0.020628,0.555465
18896,0.003974,0.010613,0.985412
18897,0.693031,0.178188,0.128781
18898,0.628450,0.187763,0.183787


_______