In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

import joblib


In [2]:

# Use the dataset
apps_df = pd.read_csv('Apps.csv')
reviews_df = pd.read_csv('Reviews.csv')
merged_df = pd.merge(apps_df, reviews_df, left_on='appId', right_on='app_Id')


In [3]:

# Preprocess the data
merged_df.drop_duplicates(subset=['content'], inplace=True)
merged_df.dropna(subset=['content'], inplace=True)
X = merged_df['content']
y = merged_df['score_y']


In [4]:

# Transform the textual data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']

In [5]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [6]:

# Define the hyperparameter grid
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}


In [7]:

# Create a Decision Tree Classifier
clf = DecisionTreeClassifier()


In [8]:

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [9]:

# Print the best hyperparameters and the corresponding score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Hyperparameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2}
Best Score: 0.6014951176075849


In [10]:

# Evaluate the model on the test set
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[2276  126  155  137 1480]
 [ 490   37   84   91  415]
 [ 431   67  112  196  780]
 [ 342   59  136  322 1791]
 [ 565   69  177  544 9551]]
              precision    recall  f1-score   support

         1.0       0.55      0.55      0.55      4174
         2.0       0.10      0.03      0.05      1117
         3.0       0.17      0.07      0.10      1586
         4.0       0.25      0.12      0.16      2650
         5.0       0.68      0.88      0.77     10906

    accuracy                           0.60     20433
   macro avg       0.35      0.33      0.33     20433
weighted avg       0.53      0.60      0.55     20433



In [11]:

# Save the trained model
joblib.dump(clf, 'decision_tree_model.joblib')


['decision_tree_model.joblib']