# Model training

In [8]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import re
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack


In [9]:
# Load the first input table from googleplaystore.csv (path is relative to the
# notebook's working directory). It is joined to the reviews table on 'App' below.
playstore_df = pd.read_csv("googleplaystore.csv")


In [10]:
# Load the second input table from googleplaystore_user_reviews.csv.
# Presumably this file supplies the 'Sentiment' and 'Translated_Review' columns
# used later — confirm against the CSV header.
reviews_df = pd.read_csv("googleplaystore_user_reviews.csv")

In [11]:
# Inner-join the reviews to the app table on the shared 'App' column, so only
# apps that appear in both tables are kept.
# NOTE(review): if 'App' is not unique in playstore_df, every matching review
# row is repeated once per app row — verify before relying on row counts.
df = reviews_df.merge(playstore_df, how='inner', on='App')

In [12]:
# Keep only rows that have both a sentiment label (the target) and review text
# (the TF-IDF input); rows missing either one are discarded.
df = df.dropna(subset=['Sentiment', 'Translated_Review'])

In [13]:
# Impute missing values in one pass: Rating gets the column median, while
# Reviews and Price get a literal 0.
rating_median = df['Rating'].median()
df = df.assign(
    Rating=df['Rating'].fillna(rating_median),
    Reviews=df['Reviews'].fillna(0),
    Price=df['Price'].fillna(0),
)

In [14]:
# Fill the remaining gaps in the app metadata columns.
# Size and Installs receive the string placeholder '0'; the two version
# columns receive their most frequent value (mode()[0]).
# NOTE: the '0' placeholder for Size has no 'M'/'K' suffix, so clean_size in
# the next cell converts it to NaN rather than to a zero size.
df['Size'] = df['Size'].fillna('0')  # placeholder only; converted by clean_size below
df['Installs'] = df['Installs'].fillna('0')
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].mode()[0])
df['Android Ver'] = df['Android Ver'].fillna(df['Android Ver'].mode()[0])

In [15]:
def clean_size(size):
    """Convert a raw ``Size`` value to a size in megabytes.

    Parameters
    ----------
    size : str, number or missing value
        Typically a string such as ``'19M'`` or ``'512k'``. Numeric input is
        accepted so the cell can be re-run on an already-converted column.

    Returns
    -------
    float
        The size in megabytes. ``'<n>M'`` yields ``n`` and ``'<n>K'`` yields
        ``n / 1024``. Missing values, ``'Varies with device'``, strings without
        an M/K suffix and strings whose numeric part cannot be parsed all
        yield ``np.nan``.
    """
    if pd.isnull(size):
        return np.nan
    # Already numeric (e.g. the conversion cell was executed twice): pass the
    # value through instead of failing on the string methods below.
    if isinstance(size, (int, float, np.integer, np.floating)):
        return float(size)

    # Normalise case/whitespace and drop thousands separators ('1,024k').
    text = str(size).strip().upper().replace(',', '')
    if text.endswith('M'):
        divisor = 1.0
    elif text.endswith('K'):
        divisor = 1024.0
    else:
        # Covers 'Varies with device' and any value without a known unit.
        return np.nan

    try:
        # Strip only the trailing unit character, not every occurrence of it.
        return float(text[:-1]) / divisor
    except ValueError:
        # Malformed numeric part such as a bare 'M'.
        return np.nan

# Apply clean_size element-wise, replacing the raw Size values with floats in
# megabytes (NaN where clean_size cannot derive a size).
df['Size'] = df['Size'].apply(clean_size)

In [16]:
# Keep the first row for each distinct review text, then discard rows that
# lack any of the fields the model uses (target, text, rating, category).
df = df.drop_duplicates(subset=['Translated_Review'])
df = df.dropna(subset=['Sentiment', 'Translated_Review', 'Rating', 'Category'])

In [17]:
# Clean Rating
# Coerce to numeric (unparseable entries become NaN), then impute the gaps
# with the column median.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Rating'] selection: that chained in-place
# form operates on an intermediate object, raises a FutureWarning on recent
# pandas and leaves df unchanged once Copy-on-Write is active.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = df['Rating'].fillna(df['Rating'].median())

In [18]:
# Encode Sentiment
# Map the three text labels to integer class ids; any other value maps to NaN.
sentiment_to_id = dict(Negative=0, Neutral=1, Positive=2)
df['Sentiment'] = df['Sentiment'].map(sentiment_to_id)

In [19]:
# Encode Category
from sklearn.preprocessing import LabelEncoder

# Learn the distinct category labels, then replace each label with its integer
# code. The codes are used below as a single numeric feature.
le = LabelEncoder()
le.fit(df['Category'])
df['Category_encoded'] = le.transform(df['Category'])

In [20]:
# Features: raw review text plus two numeric columns. Target: encoded sentiment.
X = df[['Translated_Review', 'Rating', 'Category_encoded']]
y = df['Sentiment']

# 80/20 split, stratified on the target so both parts keep the same class
# proportions; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorise the review text with TF-IDF, capped at 300 terms with English stop
# words removed. The vocabulary is fitted on the training text only and then
# applied unchanged to the test text.
tfidf = TfidfVectorizer(stop_words='english', max_features=300)
X_train_text = tfidf.fit_transform(X_train['Translated_Review'])
X_test_text = tfidf.transform(X_test['Translated_Review'])


In [21]:
# Numeric side features for each split; the index is reset so the frames are
# numbered 0..n-1 in the same row order as the TF-IDF matrices.
X_train_num = X_train[['Rating', 'Category_encoded']].reset_index(drop=True)
X_test_num = X_test[['Rating', 'Category_encoded']].reset_index(drop=True)

# Final feature matrix: TF-IDF columns followed by Rating and Category_encoded,
# stacked column-wise with scipy.sparse.hstack.
X_train_final = hstack([X_train_text, X_train_num])
X_test_final = hstack([X_test_text, X_test_num])

# Random Forest

In [22]:
# Train the Random Forest model: 100 trees with a fixed seed, fitted on the
# combined TF-IDF + numeric training matrix.

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_final, y_train)

In [23]:
# Evaluate the Model

y_pred = rf.predict(X_test_final)

print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))


 Confusion Matrix:
 [[ 658  128  410]
 [  56  545  174]
 [ 155  199 3012]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.55      0.64      1196
           1       0.62      0.70      0.66       775
           2       0.84      0.89      0.87      3366

    accuracy                           0.79      5337
   macro avg       0.74      0.72      0.72      5337
weighted avg       0.79      0.79      0.78      5337



# XGBoost Classifier

In [25]:
from xgboost import XGBClassifier

# Train-test split
# Repeats the split from the Random Forest section with identical columns,
# test_size, random_state and stratification.

X = df[['Translated_Review', 'Rating', 'Category_encoded']]
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [26]:
# TF-IDF
# Same vectoriser settings as in the Random Forest section (English stop words,
# at most 300 terms); fitted on the training text only, then applied to the
# test text.
tfidf = TfidfVectorizer(stop_words='english', max_features=300)
X_train_text = tfidf.fit_transform(X_train['Translated_Review'])
X_test_text = tfidf.transform(X_test['Translated_Review'])

In [27]:
# Combine with numeric features
# Select the two numeric columns per split and reset the index so the frames
# are numbered 0..n-1 in the same row order as the TF-IDF matrices.
X_train_num = X_train[['Rating', 'Category_encoded']].reset_index(drop=True)
X_test_num = X_test[['Rating', 'Category_encoded']].reset_index(drop=True)

In [28]:
# Column-wise stack: TF-IDF columns first, then Rating and Category_encoded.
# These matrices are also the ones the Logistic Regression section trains and
# evaluates on.
X_train_final = hstack([X_train_text, X_train_num])
X_test_final = hstack([X_test_text, X_test_num])

In [29]:
# Train XGBoost
# Multi-class log-loss is the evaluation metric; random_state fixes the seed.
# The former `use_label_encoder=False` argument is omitted: the option is
# deprecated and ignored by current XGBoost releases, where passing it only
# produces a "parameters are not used" warning. The target is already encoded
# as the integers 0..2, so no label encoding is needed.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_final, y_train)

In [30]:
# Evaluate
y_pred = xgb.predict(X_test_final)
print("\n [XGBoost] Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n [XGBoost] Classification Report:\n", classification_report(y_test, y_pred))


 [XGBoost] Confusion Matrix:
 [[ 681  165  350]
 [  21  645  109]
 [ 162  259 2945]]

 [XGBoost] Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.57      0.66      1196
           1       0.60      0.83      0.70       775
           2       0.87      0.87      0.87      3366

    accuracy                           0.80      5337
   macro avg       0.75      0.76      0.74      5337
weighted avg       0.81      0.80      0.80      5337



# Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression


# Train Logistic Regression
# Fits on X_train_final / y_train as built in the XGBoost section; no new split
# or vectoriser is created here. max_iter is raised to 1000 iterations.
# NOTE(review): Rating and Category_encoded enter unscaled next to the TF-IDF
# values, and Category_encoded is an arbitrary integer code — consider scaling
# and one-hot encoding for a linear model; verify the impact.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_final, y_train)


In [32]:
# Evaluate
y_pred = lr.predict(X_test_final)
print("\n [Logistic Regression] Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n [Logistic Regression] Classification Report:\n", classification_report(y_test, y_pred))


 [Logistic Regression] Confusion Matrix:
 [[ 711  134  351]
 [  45  561  169]
 [ 160  188 3018]]

 [Logistic Regression] Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.59      0.67      1196
           1       0.64      0.72      0.68       775
           2       0.85      0.90      0.87      3366

    accuracy                           0.80      5337
   macro avg       0.75      0.74      0.74      5337
weighted avg       0.80      0.80      0.80      5337

