<a href="https://colab.research.google.com/github/ahmad-smasri/CODSOFT/blob/main/Project_1_Movie_Genre_Classification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing needed Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Defining the function that reads the text file

In [2]:
def read_data(filename, encoding="utf-8"):
  """Parse a " ::: "-delimited text file into a DataFrame.

  Every usable line is read as ``ID ::: Title ::: Genre ::: Description``.
  Lines with fewer than four fields (blank lines included) are skipped.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform's locale settings.

  Returns:
    A pandas DataFrame with the string columns "ID", "Title", "Description"
    and "Genre" (in that order), one row per parsed line.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid in the given encoding.
  """
  separator = " ::: "
  data_dict = { "ID": [] ,"Title": [] , "Description": [], "Genre": [] }

  with open(filename, 'r', encoding=encoding) as file:
    # Stream the file line by line instead of loading it all into memory.
    for line in file:
      # maxsplit=3 keeps a description intact even when the description
      # itself contains the separator.
      fields = line.rstrip("\n").split(separator, 3)
      if len(fields) < 4:
        continue
      movie_id, title, genre, description = fields
      data_dict["ID"].append(movie_id)
      data_dict["Title"].append(title)
      data_dict["Genre"].append(genre)
      data_dict["Description"].append(description)

  return pd.DataFrame(data_dict)

### Converting training data into a table

In [3]:
# Raw training split; the path points into a Google Drive folder under /content/drive.
training_file = "/content/drive/MyDrive/Project 1 Dataset/Genre Classification Dataset/train_data.txt"
training_data = read_data(training_file)

# Keep only the columns used for modelling (description text and genre label).
# Note the capital "D": training_data is the raw frame, training_Data the trimmed one.
training_Data = training_data.drop(columns=["ID", "Title"])
training_Data.head()

Unnamed: 0,Description,Genre
0,Listening in to a conversation between his doc...,drama
1,A brother and sister with a past incestuous re...,thriller
2,As the bus empties the students for their fiel...,adult
3,To help their unemployed father make ends meet...,drama
4,The film's title refers not only to the un-rec...,drama


### Converting testing data into a table

In [4]:
# Raw testing split (with genre labels); the path points into a Google Drive folder under /content/drive.
testing_file = "/content/drive/MyDrive/Project 1 Dataset/Genre Classification Dataset/test_data_solution.txt"
testing_data = read_data(testing_file)

# Keep only the columns used for evaluation (description text and genre label).
# Note the capital "D": testing_data is the raw frame, testing_Data the trimmed one.
testing_Data = testing_data.drop(columns=["ID", "Title"])
testing_Data.head()

Unnamed: 0,Description,Genre
0,"L.R. Brane loves his life - his car, his apart...",thriller
1,"Spain, March 1964: Quico is a very naughty chi...",comedy
2,One year in the life of Albin and his family o...,documentary
3,"His father has died, he hasn't spoken with his...",drama
4,Before he was known internationally as a marti...,drama


## Preprocessing Training and Testing Data

### Labeling Genres

In [5]:
from sklearn.preprocessing import LabelEncoder

# A single encoder, fitted on the training genres only, defines the
# genre -> integer mapping for BOTH splits. Fitting a separate mapping on the
# test split would assign different integers to the same genre whenever the two
# splits do not contain exactly the same set of genres, which would corrupt
# every metric computed afterwards.
le = LabelEncoder()

# Fit and transform from the raw frames (training_data / testing_data, lower-case
# "d"), whose Genre columns still hold the original strings. The trimmed frames
# (training_Data / testing_Data) share their row order, so the encoded values line
# up, and re-running this cell stays safe: le.classes_ always holds genre names.
le.fit(training_data["Genre"])

#Training Set
training_Data["Genre"] = le.transform(training_data["Genre"])

print(training_Data)

#Testing Set
# Raises ValueError if the test split contains a genre never seen in training,
# instead of silently inventing a mismatched code for it.
testing_Data["Genre"] = le.transform(testing_data["Genre"])

print(testing_Data)

                                             Description  Genre
0      Listening in to a conversation between his doc...      8
1      A brother and sister with a past incestuous re...     24
2      As the bus empties the students for their fiel...      1
3      To help their unemployed father make ends meet...      8
4      The film's title refers not only to the un-rec...      8
...                                                  ...    ...
54209  This short-lived NBC live sitcom centered on B...      5
54210  The NEXT Generation of EXPLOITATION. The siste...     13
54211  Ze bestaan echt, is a stand-up comedy about gr...      7
54212  Walter and Vivian live in the country and have...      5
54213  On Labor Day Weekend, 1935, the most intense h...     12

[54214 rows x 2 columns]
                                             Description  Genre
0      L.R. Brane loves his life - his car, his apart...     24
1      Spain, March 1964: Quico is a very naughty chi...      5
2      One yea

### Preprocessing Description

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of the descriptions, with the vocabulary capped at
# 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# The vocabulary and IDF weights are learned from the training descriptions only;
# the testing descriptions are projected onto that same fitted vocabulary.
training_features = vectorizer.fit_transform(training_Data["Description"])
testing_features = vectorizer.transform(testing_Data["Description"])

## Splitting data into features and target

In [7]:
# X_* are the TF-IDF feature matrices, y_* the matching "Genre" columns.
X_train, y_train = training_features, training_Data["Genre"]
X_test, y_test = testing_features, testing_Data["Genre"]

# Training and Testing

## Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB

#Training and fitting
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

#Testing the model
# Predict once and reuse the result for every metric instead of re-running
# inference on the whole test set four times.
nb_predictions = nb_model.predict(X_test)

# zero_division=0 makes the handling of genres that are never predicted explicit:
# their score counts as 0 (the same value the default yields) without emitting
# an UndefinedMetricWarning.
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted', zero_division=0)
nb_recall = recall_score(y_test, nb_predictions, average='weighted', zero_division=0)
nb_f1 = f1_score(y_test, nb_predictions, average='weighted', zero_division=0)

print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1 Score:", nb_f1)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5092066420664206
Precision: 0.4983971980051346
Recall: 0.5092066420664206
F1 Score: 0.4198005725778971


## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

#Training and Fitting
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

#Testing Output
# Predict once and reuse the result for every metric instead of re-running
# inference on the whole test set four times.
lr_predictions = lr_model.predict(X_test)

# zero_division=0 makes the handling of genres that are never predicted explicit:
# their score counts as 0 (the same value the default yields) without emitting
# an UndefinedMetricWarning.
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted', zero_division=0)
lr_recall = recall_score(y_test, lr_predictions, average='weighted', zero_division=0)
lr_f1 = f1_score(y_test, lr_predictions, average='weighted', zero_division=0)

print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5946309963099631
Precision: 0.5794043543563216
Recall: 0.5946309963099631
F1 Score: 0.553183131327619


## Support Vector Machines

In [10]:
from sklearn.svm import SVC

#Training Set
# NOTE(review): kernel SVC training scales poorly with the number of samples;
# sklearn.svm.LinearSVC is the usual faster choice for large sparse text data,
# but it optimises a different objective, so swapping it in would change the
# reported scores. The model is intentionally left as-is here.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

#Testing Set
# Predict once and reuse the result for every metric; SVC inference is expensive,
# so repeating it for each metric multiplies the evaluation time for no benefit.
svm_predictions = svm_model.predict(X_test)

# zero_division=0 makes the handling of genres that are never predicted explicit:
# their score counts as 0 (the same value the default yields) without emitting
# an UndefinedMetricWarning.
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted', zero_division=0)
svm_recall = recall_score(y_test, svm_predictions, average='weighted', zero_division=0)
svm_f1 = f1_score(y_test, svm_predictions, average='weighted', zero_division=0)

print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.600110701107011
Precision: 0.5796187862903709
Recall: 0.600110701107011
F1 Score: 0.564902354689855


# Checking the Optimal Model

In [11]:
# Best value reached by any model for each metric.
max_accuracy = max(svm_accuracy, lr_accuracy, nb_accuracy)
max_Precision = max(svm_precision, lr_precision, nb_precision)
max_Recall = max(svm_recall, lr_recall, nb_recall)
max_F1 = max(svm_f1, lr_f1, nb_f1)

# Scores of every model, grouped by metric. Within each metric the candidates are
# listed in tie-break order: max() returns the first maximal entry, so on an
# exact tie Support Vector Machines wins over Logistic Regression, which wins
# over Naive Bayes.
metric_scores = [
  ("Accuracy", [("Support Vector Machines", svm_accuracy),
                ("Logistic Regression", lr_accuracy),
                ("Naive Bayes", nb_accuracy)]),
  ("Precision", [("Support Vector Machines", svm_precision),
                 ("Logistic Regression", lr_precision),
                 ("Naive Bayes", nb_precision)]),
  ("Recall", [("Support Vector Machines", svm_recall),
              ("Logistic Regression", lr_recall),
              ("Naive Bayes", nb_recall)]),
  ("F1 Score", [("Support Vector Machines", svm_f1),
                ("Logistic Regression", lr_f1),
                ("Naive Bayes", nb_f1)]),
]

# One line per metric, naming the metric and the winning score so the four
# results can be told apart in the output.
for metric_name, candidates in metric_scores:
  best_model, best_score = max(candidates, key=lambda candidate: candidate[1])
  print("The optimal model by {} is {} ({:.4f})".format(metric_name, best_model, best_score))

The optimal model is Support Vector Machines
The optimal model is Support Vector Machines
The optimal model is Support Vector Machines
The optimal model is Support Vector Machines
