In [63]:
!pip install nltk

nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [65]:
# Reading the CSV file
df = pd.read_csv('yt_data.csv')

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


**Data Information**

In [66]:
df.dtypes

Unnamed: 0,0
Comment,object
Sentiment,object


In [67]:
df.shape

(18408, 2)

**Handling NaN values**

In [68]:
df.isnull().sum()

Unnamed: 0,0
Comment,44
Sentiment,0


In [69]:
# Since NaN values were less in quantity compared to total shape of dataset, removing them would be a good decision
df = df.dropna(subset=['Comment'])

df.isnull().sum()

Unnamed: 0,0
Comment,0
Sentiment,0


**Data Encoding**

In [70]:

label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

df.head()

# Negative: 0
# Neutral : 1
# Positive: 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])


Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,1
1,here in nz 50 of retailers don’t even have con...,0
2,i will forever acknowledge this channel with t...,2
3,whenever i go to a place that doesn’t take app...,0
4,apple pay is so convenient secure and easy to ...,2


**Text Analytics: Removal of stop words**

In [71]:
def stopwords_removal(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filter_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filter_words)

In [73]:
df['Comment'] = df['Comment'].apply(stopwords_removal)

df.head()

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay 2014 required brand new ...,1
1,nz 50 retailers don’t even contactless credit ...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2


**Features selection and Data Partionning**

In [74]:
X = df['Comment']
Y = df['Sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

**Vectorization of dataset**

In [77]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

**Model Training**

In [78]:
model = LogisticRegression()
model.fit(X_train_tfidf, Y_train)

**Model prediction and Accuracy**

In [80]:
Y_pred = model.predict(X_test_tfidf)

print(f"Accuracy Score : {accuracy_score(Y_test, Y_pred)}")
print(f"Classification Report : {classification_report(Y_test, Y_pred)}")

Accuracy Score : 0.7541519194119248
Classification Report :               precision    recall  f1-score   support

           0       0.62      0.31      0.42       441
           1       0.62      0.57      0.59       912
           2       0.81      0.91      0.86      2320

    accuracy                           0.75      3673
   macro avg       0.68      0.60      0.62      3673
weighted avg       0.74      0.75      0.74      3673



In [82]:
test_comment = ["To the person reading this, take a deep breath, pull your shoulders away from your ears, relax your forehead, pull your tongue off the roof of your mouth, close your eyes and exhale. You&#39;re doing great and you are loved."]
test_comment_tfidf = vectorizer.transform(test_comment)
prediction = model.predict(test_comment_tfidf)
print(f"Predicted Sentiment: {prediction}")

Predicted Sentiment: [2]


In [84]:
with open("logistic_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)