In [151]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix

In [152]:
# Read the three source spreadsheets, in order, into df1, df2 and df3
df1, df2, df3 = (
    pd.read_excel(path)
    for path in (
        "datasets/imbd_test_data.xlsx",
        "datasets/rotten_tomatoes.xlsx",
        "datasets/synthetic_100.xlsx",
    )
)

In [153]:
# df1.head()
# df2.head()
# df3.head()

In [154]:
# df1 - imbd dataset
# df2 - rotten tomatoes dataset
# df3 - synthetic dataset

# Give all three frames the same column names: "title", "synopsis", "label"
df1 = df1.rename(columns={"movie_name": "title", "plot": "synopsis"})
df2 = df2.rename(columns={"Title": "title", "Genre": "label"})
df3 = df3.rename(columns={"Title": "title", "Genre": "label", "Synopsis": "synopsis"})

In [155]:
def lower_words_synopsis(data_sets):
    """Return copies of the given DataFrames with the "synopsis" column lower-cased.

    The input frames are left untouched: each result is a new DataFrame built
    with ``DataFrame.assign``, so re-running the calling cell (or keeping a
    reference to the raw frame) never sees a half-processed state.

    Parameters
    ----------
    data_sets : iterable of pandas.DataFrame
        Frames that each contain a string column named "synopsis".

    Returns
    -------
    list of pandas.DataFrame
        One new frame per input, in the same order, with the same columns.
    """
    return [
        data.assign(synopsis=data["synopsis"].str.lower())
        for data in data_sets
    ]

In [156]:
# Lower-case the "synopsis" text of all three frames and rebind df1/df2/df3
# to the frames returned by the helper.
df1, df2, df3 = lower_words_synopsis(data_sets=[df1, df2, df3])

In [None]:
cvec = CountVectorizer(stop_words="english")


def _append_to_report(report_path, text):
    """Append ``text`` to the report file; on failure print the error and carry on."""
    try:
        with open(report_path, "a") as file:
            file.write(text)
    except Exception as e:
        # Best effort: a failed report write must not abort the evaluation loop.
        print(f"Some exception occured: {e}")


# Function for testing multiple datasets
def data_tester(data_sets, models, report_path, acc=True, creport=True,
                cfmat=True, tsize=0.20, rstate=45, vec=cvec):
    """Train and evaluate every model on every dataset, printing and logging the results.

    For each dataset the "synopsis" column is used as the input text and the
    "label" column as the target. The data is split once per dataset with
    ``train_test_split``; each model is then fitted inside a
    ``Pipeline([("cvec", vec), ("model", model)])`` and scored on the test split.

    Parameters
    ----------
    data_sets : iterable of DataFrame
        Each frame must provide "synopsis" and "label" columns.
    models : iterable of estimators
        The same estimator objects are passed to a Pipeline for every dataset.
    report_path : str
        Text file that results are appended to (opened in "a" mode).
    acc, creport, cfmat : bool
        Toggle the accuracy score, the classification report and the confusion
        matrix respectively. Only the sections that are enabled are computed,
        printed and (for the report / matrix) written to the file.
    tsize : float
        ``test_size`` forwarded to ``train_test_split``.
    rstate : int
        ``random_state`` forwarded to ``train_test_split``.
    vec : vectoriser
        First pipeline step. The default is the module-level ``cvec`` bound at
        definition time; the same instance is shared by every pipeline.

    Returns
    -------
    None
    """
    for ind, data in enumerate(data_sets):
        X, y = data["synopsis"], data["label"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=tsize, random_state=rstate
        )

        print(f"\n\n--------> DATASET NUMBER - {ind + 1}\n\n")
        _append_to_report(report_path, f"--------> DATASET NUMBER - {ind + 1}\n\n")

        for model in models:
            pipe = Pipeline([
                ("cvec", vec),
                ("model", model)
            ])
            pipe.fit(X_train, y_train)
            preds = pipe.predict(X_test)

            # Collect the file entry piece by piece so that a disabled section is
            # simply left out instead of referencing a variable that was never set.
            report_parts = [f"Model name: {model}\n"]

            print(f"Model name - {model}")
            if acc:
                acc_score = accuracy_score(y_true=y_test, y_pred=preds)
                print(f"Overall Accuracy: {acc_score}")
            if creport:
                print("\nClassification report:")
                clf = classification_report(y_true=y_test, y_pred=preds)
                print(clf)
                report_parts.append(clf + "\n")
            if cfmat:
                conf_mat = confusion_matrix(y_true=y_test, y_pred=preds)
                print("\nConfusion matrix:")
                print(conf_mat)
                report_parts.append("\nConfusion matrix:\n" + str(conf_mat) + "\n\n")
            else:
                # Keep a blank line between consecutive model entries.
                report_parts.append("\n")

            _append_to_report(report_path, "".join(report_parts))
            

In [158]:
# Seed every estimator that has a random component so the report is reproducible
# across runs (45 matches data_tester's default rstate used for the split).
# MultinomialNB takes no seed; LogisticRegression is left at its defaults.
data_tester(
    data_sets=[df1, df2, df3],
    models=[
        GradientBoostingClassifier(random_state=45),
        AdaBoostClassifier(algorithm="SAMME", random_state=45),
        MultinomialNB(),
        RandomForestClassifier(random_state=45),
        LogisticRegression(),
        DecisionTreeClassifier(random_state=45)
    ],
    report_path="./reports/report1.txt"
)



--------> DATASET NUMBER - 1


Model name - GradientBoostingClassifier()
Overall Accuracy: 0.675

Classification report:
              precision    recall  f1-score   support

    romantic       0.68      0.65      0.67        20
    thriller       0.67      0.70      0.68        20

    accuracy                           0.68        40
   macro avg       0.68      0.68      0.67        40
weighted avg       0.68      0.68      0.67        40


Confusion matrix:
[[13  7]
 [ 6 14]]
Model name - AdaBoostClassifier(algorithm='SAMME')
Overall Accuracy: 0.7

Classification report:
              precision    recall  f1-score   support

    romantic       0.72      0.65      0.68        20
    thriller       0.68      0.75      0.71        20

    accuracy                           0.70        40
   macro avg       0.70      0.70      0.70        40
weighted avg       0.70      0.70      0.70        40


Confusion matrix:
[[13  7]
 [ 5 15]]
Model name - MultinomialNB()
Overall Accuracy: 0.7

##### Conclusion: 
The synthetic data (generated by GPT) is of very low quality — so low that every classification algorithm, run with its default hyperparameters, overfits. We therefore won't rely on the synthetic data at all.

**Let's generate another report.** This report again covers three datasets: the first two are the IMDb and Rotten Tomatoes datasets, and the third is the IMDb and Rotten Tomatoes datasets combined.

In [159]:
# Combined IMDb + Rotten Tomatoes frame. NOTE: this rebinds df3, which held the
# synthetic dataset up to this point. ignore_index=True gives the stacked rows a
# fresh 0..n-1 index instead of carrying over each source frame's own row labels.
df3 = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [160]:
# Seed every estimator that has a random component so the report is reproducible
# across runs (45 matches data_tester's default rstate used for the split).
# MultinomialNB takes no seed; LogisticRegression is left at its defaults.
data_tester(
    data_sets=[df1, df2, df3],
    models=[
        GradientBoostingClassifier(random_state=45),
        AdaBoostClassifier(algorithm="SAMME", random_state=45),
        MultinomialNB(),
        RandomForestClassifier(random_state=45),
        LogisticRegression(),
        DecisionTreeClassifier(random_state=45)
    ],
    report_path="./reports/report2.txt"
)



--------> DATASET NUMBER - 1


Model name - GradientBoostingClassifier()
Overall Accuracy: 0.7

Classification report:
              precision    recall  f1-score   support

    romantic       0.72      0.65      0.68        20
    thriller       0.68      0.75      0.71        20

    accuracy                           0.70        40
   macro avg       0.70      0.70      0.70        40
weighted avg       0.70      0.70      0.70        40


Confusion matrix:
[[13  7]
 [ 5 15]]
Model name - AdaBoostClassifier(algorithm='SAMME')
Overall Accuracy: 0.7

Classification report:
              precision    recall  f1-score   support

    romantic       0.72      0.65      0.68        20
    thriller       0.68      0.75      0.71        20

    accuracy                           0.70        40
   macro avg       0.70      0.70      0.70        40
weighted avg       0.70      0.70      0.70        40


Confusion matrix:
[[13  7]
 [ 5 15]]
Model name - MultinomialNB()
Overall Accuracy: 0.775

##### Final Conclusion:
Even after combining the two datasets, the results were not that good. The Rotten Tomatoes dataset on its own performs better than the combined dataset. This is because the IMDb dataset is of lower quality, which has a negative impact on the combined dataset as a whole.

**So, we will proceed with the Rotten Tomatoes dataset.**  
We get the best results when we use the logistic regression algorithm on DATASET NUMBER - 2, i.e. the Rotten Tomatoes dataset.