In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.ensemble import VotingClassifier
import joblib
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv("../data/data.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
def apply_rating(num):
    if num >= 4:
        return "1"
    elif num <= 2:
        return "-1"
    else:
        return "0"
df["sentiment"] = df["overall"].apply(apply_rating)
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound,sentiment
0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0,1
1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0,1
2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0,1
3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0,1
4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0,1


In [4]:
def clean_text(text):
    text = text.replace("\n", " ")
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [5]:
df.isna().sum()

reviewerName            1
overall                 0
reviewText              1
reviewTime              0
day_diff                0
helpful_yes             0
helpful_no              0
total_vote              0
score_pos_neg_diff      0
score_average_rating    0
wilson_lower_bound      0
sentiment               0
dtype: int64

In [6]:
df.dropna(inplace=True)
df.isna().sum()

reviewerName            0
overall                 0
reviewText              0
reviewTime              0
day_diff                0
helpful_yes             0
helpful_no              0
total_vote              0
score_pos_neg_diff      0
score_average_rating    0
wilson_lower_bound      0
sentiment               0
dtype: int64

In [7]:
word_vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=30000
)

chat_vactorizer = TfidfVectorizer(
    analyzer="char",
    preprocessor=clean_text,
    lowercase=True,
    ngram_range=(2, 4   ),
    min_df=5,
    max_df=0.9,
    max_features=30000
)

vectorizer = FeatureUnion([
    ("word", word_vectorizer),
    ("char", chat_vactorizer)
])
X = df[["reviewText"]]
y = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
X_train_vec = vectorizer.fit_transform(X_train["reviewText"])
X_test_vec = vectorizer.transform(X_test["reviewText"])

In [8]:
X_train_char_vec = chat_vactorizer.fit_transform(X_train["reviewText"])
X_test_char_vec = chat_vactorizer.transform(X_test["reviewText"])

In [9]:
model1 = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
model2 = LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, random_state=42)
model3 = LogisticRegression(C=10.0, solver='saga', penalty='l1', class_weight='balanced', max_iter=1000, random_state=42)
ensemble = VotingClassifier(
    estimators=[
        ('lr_std', model1),
        ('lr_low_c', model2),
        ('lr_l1', model3)
    ],
    voting='soft'
)

In [10]:
pipe = Pipeline([
    ("vectorizer", vectorizer),
    ("ensemble", ensemble)
])
pipe.fit(X_train["reviewText"], y_train)
y_pred = pipe.predict(X_test["reviewText"])
y_pred_proba = pipe.predict_proba(X_test["reviewText"])

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(
    "ROC AUC Score:",
    roc_auc_score(
        y_test,
        y_pred_proba,
        multi_class="ovr",
        average="macro"
    )
)
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))



              precision    recall  f1-score   support

          -1       0.56      0.69      0.62        65
           0       0.05      0.04      0.04        28
           1       0.96      0.95      0.95       890

    accuracy                           0.91       983
   macro avg       0.52      0.56      0.54       983
weighted avg       0.90      0.91      0.90       983

[[ 45   3  17]
 [  6   1  21]
 [ 30  15 845]]
ROC AUC Score: 0.8766857358857152
F1 Score: 0.5373927456270146


In [11]:
X1 = pd.DataFrame({"reviewText": ["dont buy"]})
pipe.predict(X1["reviewText"])

array(['-1'], dtype=object)

In [12]:
word1 = TfidfVectorizer(
    preprocessor=clean_text,
    lowercase=True,
    ngram_range=(1,2),
    stop_words="english",
    max_features=20000,
    min_df=5,
    max_df=0.9,
    sublinear_tf=True
)


char1 = TfidfVectorizer(
    preprocessor=clean_text,
    analyzer="char",
    ngram_range=(3,5),
    max_features=20000,
    min_df=5,
    max_df=0.9,
    sublinear_tf=True
)
char2 = TfidfVectorizer(
    preprocessor=clean_text,
    analyzer="char",
    ngram_range=(2,4),
    max_features=15000,
    sublinear_tf=True
)
word2 = TfidfVectorizer(
    preprocessor=clean_text,
    lowercase=True,
    ngram_range=(1,1),
    stop_words="english",
    max_features=15000,
    sublinear_tf=True,
    min_df=5,
    max_df=0.9,
)
pipe1 = Pipeline([
    ("vectorizer", FeatureUnion([("word", word1), ("char", char1)])),
    ("model", LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42))
])
pipe2 = Pipeline([
    ("vectorizer", FeatureUnion([("word", word2), ("char", char1)])),
    ("model", LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000, random_state=42))
])
pipe3 = Pipeline([
    ("vectorizer", char2),
    ("model", LogisticRegression(C=10.0, solver='saga', class_weight='balanced', max_iter=1000, random_state=42))
])
ensemble = VotingClassifier(
    estimators=[
        ('pipe1', pipe1),
        ('pipe2', pipe2),
        ('pipe3', pipe3)
    ],
    voting='soft'
)
ensemble.fit(X_train["reviewText"], y_train)
y_pred = ensemble.predict(X_test["reviewText"])
y_pred_proba = ensemble.predict_proba(X_test["reviewText"])
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(
    "ROC AUC Score:",
    roc_auc_score(
        y_test,
        y_pred_proba,
        multi_class="ovr",
        average="macro"
    )
)
print("F1 Score:", f1_score(y_test, y_pred, average="macro"))



              precision    recall  f1-score   support

          -1       0.64      0.55      0.60        65
           0       0.10      0.36      0.15        28
           1       0.97      0.90      0.93       890

    accuracy                           0.86       983
   macro avg       0.57      0.60      0.56       983
weighted avg       0.93      0.86      0.89       983

[[ 36  19  10]
 [  5  10  13]
 [ 15  75 800]]
ROC AUC Score: 0.88373014109539
F1 Score: 0.56019677751886


In [15]:
joblib.dump(ensemble, "../models/sentimental_model.pkl")

['../models/sentimental_model.pkl']

----

In [None]:
cluster_vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=30000
)
