In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/archive.zip


Archive:  /content/drive/MyDrive/archive.zip
  inflating: Movies_Reviews_modified_version1.csv  

In [3]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Merged_Movies_Emotions.csv'
data = pd.read_csv(file_path)

print(data.head())

# Display the column names to understand the structure of the data
print(data.columns)


   Index  Rating                                            Reviews  \
0      0     6.4  This film, as well as the reaction to it, is a...   
1      1     7.3  I'm a big fan of all the planet of the apes, a...   
2      2     5.5  Pretty much the worst criticism you can lay on...   
3      3     7.3  Just got out of the Austin premier at SXSW and...   
4      4     7.7  This is a tough one. I liked the concept and t...   

                          Movie_Name                      Genres  \
0                    The Idea of You      Comedy, Drama, Romance   
1  Kingdom of the Planet of the Apes   Action, Adventure, Sci-Fi   
2                          Unfrosted  Biography, Comedy, History   
3                       The Fall Guy       Action, Comedy, Drama   
4                        Challengers       Drama, Romance, Sport   

                                         Description   Emotion  
0  Solène, a 40-year-old single mom, begins an un...       joy  
1  Many years after the reign of C

# **Data Cleaning**

In [4]:
# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values or fill them with appropriate values
data = data.dropna()

# Display the unique emotions to understand the emotion labels
print(data['Emotion'].unique())


Index          0
Rating         0
Reviews        0
Movie_Name     0
Genres         0
Description    0
Emotion        0
dtype: int64
['joy' 'fear' 'surprise' 'sadness' 'anger' 'neutral' 'disgust'
 'anticipation' 'optimism']


# **Feature Engineering**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine relevant text fields into a single feature
data['text'] = data['Reviews'] + ' ' + data['Description']

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the text data into TF-IDF features
X = tfidf.fit_transform(data['text'])

# Encode the emotions as numerical labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Emotion'])


# **Model Training**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForest classifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy}')


KeyboardInterrupt: 

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Assuming X and y are already defined
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}  # You can adjust the range of C values as needed

# Initialize the Logistic Regression classifier
logistic_model = LogisticRegression()

# Perform grid search with cross-validation
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_C = grid_search.best_params_['C']

# Train the Logistic Regression classifier with the best hyperparameters
logistic_model_best = LogisticRegression(C=best_C)
logistic_model_best.fit(X_train, y_train)

# Predict on the test set
logistic_y_pred = logistic_model_best.predict(X_test)

# Function to print model performance
def print_model_performance(model_name, y_test, y_pred):
    print(f'{model_name} Model Performance:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))
    print('-'*60)

# Print model performance
print_model_performance('Logistic Regression (Tuned)', y_test, logistic_y_pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression (Tuned) Model Performance:
Accuracy: 0.7237205162438808
              precision    recall  f1-score   support

           0       0.70      0.62      0.66      1217
           1       0.81      0.78      0.79      1469
           2       0.85      0.66      0.74       383
           3       0.65      0.64      0.65      1246
           4       0.67      0.71      0.69      1862
           5       0.37      0.13      0.19       199
           6       0.78      0.70      0.74       991
           7       0.73      0.83      0.78      3740
           8       0.81      0.10      0.18       128

    accuracy                           0.72     11235
   macro avg       0.71      0.58      0.60     11235
weighted avg       0.72      0.72      0.72     11235

------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Assuming X and y are already defined
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}  # You can adjust the range of alpha values as needed

# Initialize the Naive Bayes classifier
nb_model = MultinomialNB()

# Perform grid search with cross-validation
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Train the Naive Bayes classifier with the best hyperparameters
nb_model_best = MultinomialNB(alpha=best_alpha)
nb_model_best.fit(X_train, y_train)

# Predict on the test set
nb_y_pred = nb_model_best.predict(X_test)

# Function to print model performance
def print_model_performance(model_name, y_test, y_pred):
    print(f'{model_name} Model Performance:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))
    print('-'*60)

# Print model performance
print_model_performance('Naive Bayes (Tuned)', y_test, nb_y_pred)


Naive Bayes (Tuned) Model Performance:
Accuracy: 0.5080551846906987
              precision    recall  f1-score   support

           0       0.78      0.34      0.48      1217
           1       0.81      0.30      0.44      1469
           2       0.77      0.30      0.43       383
           3       0.69      0.26      0.38      1246
           4       0.71      0.32      0.44      1862
           5       0.24      0.14      0.17       199
           6       0.87      0.25      0.39       991
           7       0.43      0.94      0.59      3740
           8       0.78      0.05      0.10       128

    accuracy                           0.51     11235
   macro avg       0.68      0.32      0.38     11235
weighted avg       0.64      0.51      0.47     11235

------------------------------------------------------------


# **Recommendation System**

In [None]:
def recommend_movie(emotion_input, data, model, tfidf, label_encoder):
    # Transform the input emotion to its numerical label
    try:
        emotion_label = label_encoder.transform([emotion_input])[0]
    except ValueError:
        return f"Emotion '{emotion_input}' is not recognized by the label encoder."

    # Filter movies with the given emotion
    recommended_movies = data[data['Emotion'] == emotion_input]['Movie_Name'].unique()

    # Debug: Check the number of movies found for the given emotion
    print(f"Number of movies found for emotion '{emotion_input}': {len(recommended_movies)}")

    if len(recommended_movies) > 0:
        return recommended_movies.tolist()  # Convert numpy array to list for easier handling/display
    else:
        return "No movies found for the given emotion."

# Example usage
emotion_input = 'sadness'
recommendations = recommend_movie(emotion_input, data, model, tfidf, label_encoder)

# Displaying the number of recommendations and the first few as a sample for the user
print(f"Number of recommendations: {len(recommendations)}")
print("Sample recommendations:", recommendations[:len(recommendations)])  # Show the first 100 as a sample


Number of movies found for emotion 'sadness': 2308
Number of recommendations: 2308
Sample recommendations: ['The Fall Guy', 'Challengers', 'The Judge', 'Mother of the Bride', 'Megalopolis', 'Deadpool & Wolverine', 'Dune', 'The Holdovers', 'Despicable Me 4', 'Titanic', 'In the Land of Saints & Sinners', 'Purple Hearts', 'X', 'Once Upon a Time in... Hollywood', 'Force of Nature: The Dry 2', 'Chief of Station', 'All of Us Strangers', 'I Saw the TV Glow', 'Damsel', 'Inception', 'The Talented Mr. Ripley', 'Prom Dates', 'No Hard Feelings', 'The Hunger Games: The Ballad of Songbirds & Snakes', 'Blended', 'Forrest Gump', 'Troy', 'Deadpool', 'Gladiator 2', 'Mean Girls', 'Sixteen Candles', 'Past Lives', 'Star Wars: Episode VI - Return of the Jedi', 'Idiocracy', 'Pearl', 'Van Helsing', 'Glass', 'Good Will Hunting', 'Fifty Shades of Grey', '10 Things I Hate About You', 'Avengers: Endgame', 'Mad Max 2', 'Aliens', 'Gone Girl', 'Saving Private Ryan', 'Culpa mía', 'Requiem for a Dream', 'Guardians of 