In [86]:
import numpy as np
import pandas as pd
import pickle


In [87]:
# Load the tab-separated review dump. quoting = 3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text instead of being parsed.
df=pd.read_csv('/content/sample_data/a1_RestaurantReviews_HistoricDump.tsv',delimiter='\t', quoting = 3)

In [88]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
895,I want to first say our server was great and w...,1
896,The pizza selections are good.,1
897,"I had strawberry tea, which was good.",1
898,Highly unprofessional and rude to a loyal patron!,0


In [89]:
df.shape

(900, 2)

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  900 non-null    object
 1   Liked   900 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.2+ KB


In [91]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [92]:
df.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [17]:
df.tail(5)

Unnamed: 0,Review,Liked
895,I want to first say our server was great and w...,1
896,The pizza selections are good.,1
897,"I had strawberry tea, which was good.",1
898,Highly unprofessional and rude to a loyal patron!,0
899,"Overall, a great experience.",1



Data Preprocessing

In [93]:
import re
import nltk
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
all_stopwords=stopwords.words('english')
# Drop "not" from the stop-word list so negations survive cleaning
# (e.g. "Crust is not good." becomes "crust not good", not "crust good").
all_stopwords.remove('not')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [94]:
import nltk
from nltk.stem import WordNetLemmatizer
# NOTE(review): the visible cells tokenize with str.split(), so the punkt
# resource may not be needed -- confirm before removing.
nltk.download('punkt')  # punkt tokenizer data
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
lemmatizer = WordNetLemmatizer()

In [97]:
# Build the cleaned corpus: one normalised, lemmatised string per review.
# Iterating over the column itself (rather than range(0, 900) with label
# lookups) keeps this correct for any number of rows and any index.
stopword_set = set(all_stopwords)  # built once instead of once per word
corpus = []
for raw_review in df['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep letters only
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)


In [52]:
# Persist the cleaned corpus to disk as a pickle.
with open('/content/corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)


In [98]:
corpus[:10]

['wow loved place',
 'crust not good',
 'not tasty texture nasty',
 'stopped late may bank holiday rick steve recommendation loved',
 'selection menu great price',
 'getting angry want damn pho',
 'honeslty taste fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fry great',
 'great touch']

Data transformation

In [99]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer whose vocabulary is capped at 1420 terms.
cv = CountVectorizer(max_features=1420)


In [101]:
cv

In [61]:
import pickle
bow_path = 'Countvectorizer.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed.
# NOTE(review): in top-to-bottom order `cv` has not been fitted at this point,
# so this pickle carries no vocabulary -- confirm the fitted vectorizer is
# saved before relying on this file for inference.
with open(bow_path, "wb") as bow_file:
    pickle.dump(cv, bow_file)

In [102]:
# Learn the vocabulary and encode every cleaned review as a dense count vector.
X = cv.fit_transform(corpus).toarray()
# Labels come from the last column of the frame (`Liked`).
y = df.iloc[:, -1].values
# Save the vectorizer again now that it is fitted: the earlier dump to
# `bow_path` happens before fit_transform, and an unfitted vectorizer cannot
# transform new reviews at inference time.
with open(bow_path, "wb") as bow_file:
    pickle.dump(cv, bow_file)


In [103]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state= 0)


In [104]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((720, 1420), (180, 1420), (720,), (180,))

Model fitting (Naive Bayes)

In [42]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes classifier on the training split.
classifier = GaussianNB()
classifier.fit(X_train,y_train)


In [43]:
import joblib
# Serialise the trained classifier to the file 'sentiment_model' (relative path).
joblib.dump(classifier,'sentiment_model')

['sentiment_model']

In [105]:
# model performance
# Predict labels for the held-out features. The split cell names them
# `X_test` (capital X); a lowercase `x_test` is never defined.
y_pred = classifier.predict(X_test)


In [106]:
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1])

In [107]:
len(y_pred)

180

In [46]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Compare held-out labels with predictions. In the confusion matrix, rows are
# the true classes and columns are the predicted classes.
cm = confusion_matrix(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)
print(cm)
print(acc)

[[54 24]
 [26 76]]
0.7222222222222222


In [None]:
# Rewrite the pipeline above as reusable functions, following standard practices

In [132]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
import joblib

In [133]:
def load_data(file_path):
    """Read the tab-separated file at ``file_path`` into a DataFrame.

    Quoting is disabled (``quoting=3``), so quote characters in a field are
    kept as literal text.
    """
    return pd.read_csv(file_path, sep='\t', quoting=3)

In [134]:
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words (except "not", which is
    kept so negations survive), lemmatise the remaining words and join them
    with single spaces.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        str: The cleaned review; an empty string if no word survives.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the lookup set once per call instead of once per word. The set
    # difference keeps "not" out without raising if the list lacks that entry.
    stop_words = set(stopwords.words('english')) - {'not'}

    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(kept)

In [135]:
def preprocess_corpus(df, output_path='corpus.pkl'):
    """Clean every review in ``df['Review']`` and optionally pickle the result.

    Args:
        df: DataFrame with a ``Review`` column of raw review strings.
        output_path: Where to pickle the cleaned corpus. Defaults to
            ``'corpus.pkl'``; pass ``None`` to skip writing.

    Returns:
        list: One cleaned string per row of ``df``, in row order.
    """
    # Process every row rather than a hard-coded first 900, so the corpus
    # always has the same length as labels taken from the same frame.
    corpus = [preprocess_text(text) for text in df['Review']]
    if output_path is not None:
        with open(output_path, 'wb') as f:
            pickle.dump(corpus, f)
    return corpus

In [136]:
def transform_data(corpus, max_features=1420, return_vectorizer=False):
    """Turn cleaned review strings into a dense bag-of-words count matrix.

    Args:
        corpus: Iterable of cleaned review strings.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.
            Defaults to 1420.
        return_vectorizer: When True, also return the fitted vectorizer so
            the same vocabulary can be reused to encode new text. Defaults to
            False, in which case only the matrix is returned.

    Returns:
        The dense count matrix ``X``, or the tuple ``(X, vectorizer)`` when
        ``return_vectorizer`` is True.
    """
    cv = CountVectorizer(max_features=max_features)
    X = cv.fit_transform(corpus).toarray()
    if return_vectorizer:
        # Without the fitted vectorizer, unseen reviews cannot be mapped onto
        # the columns the model was trained on.
        return X, cv
    return X

In [137]:
def train_model(X, y, test_size=0.2, random_state=0):
    """Split the data and fit a Gaussian Naive Bayes classifier.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        test_size: Share of the rows held out for testing. Defaults to 0.2.
        random_state: Seed for the shuffle performed by the split, so the
            result is reproducible. Defaults to 0.

    Returns:
        tuple: ``(classifier, X_test, y_test)`` -- the fitted classifier and
        the held-out features and labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return classifier, X_test, y_test

In [138]:
def evaluate_model(classifier, X_test, y_test):
    """Score ``classifier`` on a held-out split and report the results.

    Prints the confusion matrix followed by the accuracy, then returns the
    accuracy.
    """
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    # One print call with a newline separator emits the heading and the matrix
    # on separate lines.
    print("Confusion Matrix:", matrix, sep="\n")
    print("Accuracy:", accuracy)
    return accuracy

In [139]:
def save_model(classifier, file_path):
    """Save a trained model to disk with joblib.

    Args:
        classifier: The object to serialise (here, the fitted classifier).
        file_path: Destination path passed straight to ``joblib.dump``.
    """
    joblib.dump(classifier, file_path)

In [140]:
# Load data
# NOTE(review): absolute '/content/...' path; adjust when running elsewhere.
file_path = '/content/sample_data/a1_RestaurantReviews_HistoricDump.tsv'
df = load_data(file_path)

In [141]:
# Preprocess corpus (also pickles the cleaned corpus to 'corpus.pkl' as a side effect)
corpus = preprocess_corpus(df)

In [142]:
# Transform data
# X: bag-of-words count matrix built from the cleaned corpus.
X = transform_data(corpus)
# y: labels taken from the last column of the frame.
y = df.iloc[:, -1].values

In [143]:
# Train model (train_model splits the data itself and returns the held-out part)
classifier, X_test, y_test = train_model(X, y)

In [144]:
# Evaluate model on the held-out split; prints the confusion matrix and accuracy
accuracy = evaluate_model(classifier, X_test, y_test)



Confusion Matrix:
[[54 24]
 [26 76]]
Accuracy: 0.7222222222222222


In [145]:
# Save model
# NOTE(review): an earlier cell dumps to 'sentiment_model' (no extension); this
# call writes a separate file, 'sentiment_model.pkl'.
save_model(classifier, 'sentiment_model.pkl')