# Model training

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import re
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack


In [2]:
# Load the Play Store app metadata (path is relative to the working directory).
playstore_df = pd.read_csv(filepath_or_buffer="googleplaystore.csv")


In [3]:
# Load the user reviews (path is relative to the working directory).
reviews_df = pd.read_csv(filepath_or_buffer="googleplaystore_user_reviews.csv")

In [4]:
# Attach app metadata to every review; the inner join keeps only apps found in both tables.
df = reviews_df.merge(playstore_df, on='App', how='inner')

In [5]:
# Keep only rows that carry both a sentiment label and review text.
required_cols = ['Sentiment', 'Translated_Review']
df = df.dropna(axis=0, how='any', subset=required_cols)

In [6]:
# Fill gaps in app metadata: median for Rating, zero for Reviews and Price.
# The tuple is built first, so the median is taken from the unfilled Rating column.
numeric_fills = (
    ('Rating', df['Rating'].median()),
    ('Reviews', 0),
    ('Price', 0),
)
for column, filler in numeric_fills:
    df[column] = df[column].fillna(filler)

In [7]:
# Size and Installs get the placeholder string '0' (Size is cleaned/converted later).
for column in ('Size', 'Installs'):
    df[column] = df[column].fillna('0')

# Version columns fall back to their most frequent value.
for column in ('Current Ver', 'Android Ver'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [8]:
def clean_size(size):
    """Convert an app-size label into a number of megabytes.

    Parameters
    ----------
    size : str, number or missing value
        A label such as ``'19M'`` or ``'512k'`` (case-insensitive, surrounding
        whitespace ignored, thousands separators tolerated), an already
        numeric size, or a null value.

    Returns
    -------
    float
        The size in megabytes. ``'K'`` values are divided by 1024. ``np.nan``
        is returned for null input, ``'Varies with device'``, labels without an
        ``M``/``K`` suffix, and labels whose numeric part cannot be parsed.
    """
    if pd.isnull(size):
        return np.nan

    # Already-converted values pass through unchanged, so re-running the
    # conversion on a numeric column is a no-op instead of an AttributeError.
    if isinstance(size, (int, float, np.integer, np.floating)):
        return float(size)

    label = str(size).strip().upper()
    if label == 'VARIES WITH DEVICE':
        return np.nan

    if label.endswith('M'):
        divisor = 1.0
    elif label.endswith('K'):
        divisor = 1024.0
    else:
        # No recognised unit suffix: treat as unknown.
        return np.nan

    try:
        return float(label[:-1].replace(',', '')) / divisor
    except ValueError:
        # A malformed numeric part should not abort the whole column conversion.
        return np.nan

# Convert every Size label to a float via clean_size.
df['Size'] = df['Size'].map(clean_size)

In [9]:
# Keep the first row for each distinct review text.
df = df.drop_duplicates(subset=['Translated_Review'])

# Require the label and the columns that feed the feature matrix.
model_cols = ['Sentiment', 'Translated_Review', 'Rating', 'Category']
df = df.dropna(subset=model_cols)

In [10]:
# Clean Rating: coerce to numeric (unparseable values become NaN), then fill
# the gaps with the median of the coerced column.
# The result is assigned back explicitly: calling
# `df['Rating'].fillna(..., inplace=True)` acts on a temporary column selection
# and does not update `df` under pandas Copy-on-Write.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = rating_numeric.fillna(rating_numeric.median())

In [11]:
# Encode Sentiment as integer class labels (labels outside the mapping become NaN).
sentiment_codes = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['Sentiment'] = df['Sentiment'].map(sentiment_codes)

In [12]:
# Encode Category as integer codes; `le` is kept so the codes can be mapped back.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['Category'])
df['Category_encoded'] = le.transform(df['Category'])

In [13]:
# Features: raw review text plus two numeric app attributes; target: encoded sentiment.
feature_cols = ['Translated_Review', 'Rating', 'Category_encoded']
X = df[feature_cols]
y = df['Sentiment']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text.
tfidf = TfidfVectorizer(stop_words='english', max_features=300)
train_reviews = X_train['Translated_Review']
test_reviews = X_test['Translated_Review']
X_train_text = tfidf.fit_transform(train_reviews)
X_test_text = tfidf.transform(test_reviews)


In [14]:
# Numeric side features, re-indexed from 0.
numeric_cols = ['Rating', 'Category_encoded']
X_train_num = X_train[numeric_cols].reset_index(drop=True)
X_test_num = X_test[numeric_cols].reset_index(drop=True)

# Final feature matrix: TF-IDF columns followed by the numeric columns.
X_train_final = hstack((X_train_text, X_train_num))
X_test_final = hstack((X_test_text, X_test_num))

In [15]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Category_encoded
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,2,1.0,0.533333,HEALTH_AND_FITNESS,4.0,2490,3.8,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up,15
2,10 Best Foods for You,This help eating healthy exercise regular basis,2,0.25,0.288462,HEALTH_AND_FITNESS,4.0,2490,3.8,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up,15
6,10 Best Foods for You,Works great especially going grocery store,2,0.4,0.875,HEALTH_AND_FITNESS,4.0,2490,3.8,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up,15
8,10 Best Foods for You,Best idea us,2,1.0,0.3,HEALTH_AND_FITNESS,4.0,2490,3.8,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up,15
10,10 Best Foods for You,Best way,2,1.0,0.3,HEALTH_AND_FITNESS,4.0,2490,3.8,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up,15


# Random forest

In [16]:
# Train the Random Forest model (100 trees, fixed seed).
rf_params = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_final, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
# Evaluate the Random Forest on the held-out split.
y_pred = rf.predict(X_test_final)

rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print("\n Confusion Matrix:\n", rf_confusion)
print("\n Classification Report:\n", rf_report)


 Confusion Matrix:
 [[ 658  128  410]
 [  56  545  174]
 [ 155  199 3012]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.55      0.64      1196
           1       0.62      0.70      0.66       775
           2       0.84      0.89      0.87      3366

    accuracy                           0.79      5337
   macro avg       0.74      0.72      0.72      5337
weighted avg       0.79      0.79      0.78      5337



# XGBoost Classifier

In [18]:
from xgboost import XGBClassifier

# Train-test split: same columns, test size, seed and stratification as the
# Random Forest section.
feature_cols = ['Translated_Review', 'Rating', 'Category_encoded']
X = df[feature_cols]
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# TF-IDF: vocabulary learned from the training text only, then applied to the test text.
tfidf = TfidfVectorizer(stop_words='english', max_features=300)
train_reviews = X_train['Translated_Review']
test_reviews = X_test['Translated_Review']
X_train_text = tfidf.fit_transform(train_reviews)
X_test_text = tfidf.transform(test_reviews)

In [20]:
# Numeric features to combine with the text features, re-indexed from 0.
numeric_cols = ['Rating', 'Category_encoded']
X_train_num = X_train[numeric_cols].reset_index(drop=True)
X_test_num = X_test[numeric_cols].reset_index(drop=True)

In [21]:
# Stack the TF-IDF columns and the numeric columns side by side.
X_train_final = hstack((X_train_text, X_train_num))
X_test_final = hstack((X_test_text, X_test_num))

In [22]:
# Train XGBoost
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and has no effect in recent releases. The target is already integer-coded
# (0/1/2) by the Sentiment mapping, so no label encoding is needed.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_final, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [23]:
# Evaluate XGBoost on the held-out split.
y_pred = xgb.predict(X_test_final)

xgb_confusion = confusion_matrix(y_test, y_pred)
xgb_report = classification_report(y_test, y_pred)

print("\n [XGBoost] Confusion Matrix:\n", xgb_confusion)
print("\n [XGBoost] Classification Report:\n", xgb_report)


 [XGBoost] Confusion Matrix:
 [[ 681  165  350]
 [  21  645  109]
 [ 162  259 2945]]

 [XGBoost] Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.57      0.66      1196
           1       0.60      0.83      0.70       775
           2       0.87      0.87      0.87      3366

    accuracy                           0.80      5337
   macro avg       0.75      0.76      0.74      5337
weighted avg       0.81      0.80      0.80      5337



# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression on the current combined feature matrix
# (iteration cap raised to 1000, fixed seed).
lr_params = {'max_iter': 1000, 'random_state': 42}
lr = LogisticRegression(**lr_params)
lr.fit(X_train_final, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [25]:
# Evaluate Logistic Regression on the held-out split.
y_pred = lr.predict(X_test_final)

lr_confusion = confusion_matrix(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)

print("\n [Logistic Regression] Confusion Matrix:\n", lr_confusion)
print("\n [Logistic Regression] Classification Report:\n", lr_report)


 [Logistic Regression] Confusion Matrix:
 [[ 711  134  351]
 [  45  561  169]
 [ 160  188 3018]]

 [Logistic Regression] Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.59      0.67      1196
           1       0.64      0.72      0.68       775
           2       0.85      0.90      0.87      3366

    accuracy                           0.80      5337
   macro avg       0.75      0.74      0.74      5337
weighted avg       0.80      0.80      0.80      5337



In [26]:
# Split column names by dtype: int64/float64 columns vs. object columns.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
object_frame = df.select_dtypes(include=['object'])

numerical_features = list(numeric_frame.columns)
categorical_features = list(object_frame.columns)