# Integrating Drive with Colab

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import os

files=os.listdir('/content/')
print(files)

['.config', 'drive', 'sample_data']


In [13]:
import pandas as pd

test_data = pd.read_csv('/content/drive/MyDrive/test_data.txt', delimiter='\t', header=None)

In [14]:
train_data = pd.read_csv('/content/drive/MyDrive/train_data.txt',delimiter='\t',header=None)

In [15]:
train_data.head()

Unnamed: 0,0
0,1 ::: Oscar et la dame rose (2009) ::: drama :...
1,2 ::: Cupid (1997) ::: thriller ::: A brother ...
2,"3 ::: Young, Wild and Wonderful (1980) ::: adu..."
3,4 ::: The Secret Sin (1915) ::: drama ::: To h...
4,5 ::: The Unrecovered (2007) ::: drama ::: The...


In [16]:
train_data.head()

Unnamed: 0,0
0,1 ::: Oscar et la dame rose (2009) ::: drama :...
1,2 ::: Cupid (1997) ::: thriller ::: A brother ...
2,"3 ::: Young, Wild and Wonderful (1980) ::: adu..."
3,4 ::: The Secret Sin (1915) ::: drama ::: To h...
4,5 ::: The Unrecovered (2007) ::: drama ::: The...


<hr><hr>

<h1>Exploring the Data.</h1>

<h1>It, looks like each line contains a unique identifier ?</h1><br><ul>Followed by a <b>movie</b> title, <b>release</b> year, and a <b>plot</b> summary.</ul><br>

<hr>

<h2>1.   Understanding the Structure</h2>



&nbsp;*   Each line appears to have the following structure:

*   <b>ID</b>: A unique identifier.
*   <b>Title and Year</b>: The movie title followed by the release year in parentheses.
*   <b>Genre</b>: The genre(s) of the movie.
*   <b>Plot Summary</b>: A brief description of the movie plot.

<hr>

<h2>2.   Loading and Processing the Data</h2>



In [17]:
with open('/content/drive/MyDrive/train_data.txt', 'r') as file:
    lines = file.readlines()

parsed_data = []
for line in lines:
    parts = line.strip().split(' ::: ')
    if len(parts) == 4:
        movie_id, title, genre, plot_summary = parts
        parsed_data.append([movie_id, title, genre, plot_summary])

# Create DataFrame
df = pd.DataFrame(parsed_data, columns=['ID', 'Title', 'Genre', 'Description'])

# Print the DataFrame
print(df)

          ID                                       Title        Genre  \
0          1                Oscar et la dame rose (2009)        drama   
1          2                                Cupid (1997)     thriller   
2          3            Young, Wild and Wonderful (1980)        adult   
3          4                       The Secret Sin (1915)        drama   
4          5                      The Unrecovered (2007)        drama   
...      ...                                         ...          ...   
54209  54210                             "Bonino" (1953)       comedy   
54210  54211                 Dead Girls Don't Cry (????)       horror   
54211  54212   Ronald Goedemondt: Ze bestaan echt (2008)  documentary   
54212  54213                    Make Your Own Bed (1944)       comedy   
54213  54214  Nature's Fury: Storm of the Century (2006)      history   

                                             Description  
0      Listening in to a conversation between his doc...  
1    

<hr>

<h2>3.   Final DataFrame</h2>



In [18]:
df[['Title', 'Year']] = df['Title'].str.extract(r'(.+) \((\d{4})\)')
df

Unnamed: 0,ID,Title,Genre,Description,Year
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his doc...,2009
1,2,Cupid,thriller,A brother and sister with a past incestuous re...,1997
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980
3,4,The Secret Sin,drama,To help their unemployed father make ends meet...,1915
4,5,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007
...,...,...,...,...,...
54209,54210,"""Bonino""",comedy,This short-lived NBC live sitcom centered on B...,1953
54210,54211,,horror,The NEXT Generation of EXPLOITATION. The siste...,
54211,54212,Ronald Goedemondt: Ze bestaan echt,documentary,"Ze bestaan echt, is a stand-up comedy about gr...",2008
54212,54213,Make Your Own Bed,comedy,Walter and Vivian live in the country and have...,1944


<hr>

<h2>4.   Exploratory Data Analysis (EDA).</h2>



In [19]:
df.head()

Unnamed: 0,ID,Title,Genre,Description,Year
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his doc...,2009
1,2,Cupid,thriller,A brother and sister with a past incestuous re...,1997
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980
3,4,The Secret Sin,drama,To help their unemployed father make ends meet...,1915
4,5,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54214 non-null  object
 1   Title        49867 non-null  object
 2   Genre        54214 non-null  object
 3   Description  54214 non-null  object
 4   Year         49867 non-null  object
dtypes: object(5)
memory usage: 2.1+ MB


<h2>*   Finding Null Values or Missing Values.</h2>



In [21]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Title,4347
Genre,0
Description,0
Year,4347


<h2>*   Handling Null Values.</h2>



In [24]:
df['Title'] = df['Title'].fillna(df['Title'].ffill(), inplace= False)
df['Year'] = df['Year'].fillna(df['Year'].bfill(), inplace = False)

In [25]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Title,0
Genre,0
Description,0
Year,0


<hr><hr>

# Model Training

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df['Genre'] = df['Genre'].astype('str')

In [30]:
X_train = df['Description']
y_train = df['Genre']

<hr>

<h2>*   Text Preprocessing and Vectorization.</h2>



In [31]:
tfidf = tfidf(max_features=5000)
x_tfidf = tfidf.fit_transform(X_train)

<hr>

<h2>*   Split the Data.</h2>



In [32]:
X_train, X_test, y_train, y_test = train_test_split(x_tfidf,y_train,test_size=0.2,random_state=42)

<hr>

<h2>*   Train the Model.</h2>



In [34]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<hr>

<h2>*   Evaluate the Model.</h2>



In [37]:
pred = model.predict(X_test)

print(f"Accuracy Score {accuracy_score(y_test,pred)}.")

Accuracy Score 0.5904270035967906.


<hr><hr>

<h2>*   Before, <b>testing</b> the Model. Let's <b>Load</b> and <b>Process</b> the <b>Test</b> Data.</h2>



In [42]:
with open('/content/drive/MyDrive/test_data.txt', 'r') as file:
    lines = file.readlines()

parsed_data = []
for line in lines:
    parts = line.strip().split(' ::: ')
    if len(parts) == 3:
        movie_id, title, plot_summary = parts
        parsed_data.append([movie_id, title, plot_summary])

# Create DataFrame
df = pd.DataFrame(parsed_data, columns=['ID', 'Title', 'Description'])

# Print the DataFrame
print(df)

          ID                           Title  \
0          1            Edgar's Lunch (1998)   
1          2        La guerra de papá (1977)   
2          3     Off the Beaten Track (2010)   
3          4          Meu Amigo Hindu (2015)   
4          5               Er nu zhai (1955)   
...      ...                             ...   
54195  54196  "Tales of Light & Dark" (2013)   
54196  54197     Der letzte Mohikaner (1965)   
54197  54198             Oliver Twink (2007)   
54198  54199               Slipstream (1973)   
54199  54200       Curitiba Zero Grau (2010)   

                                             Description  
0      L.R. Brane loves his life - his car, his apart...  
1      Spain, March 1964: Quico is a very naughty chi...  
2      One year in the life of Albin and his family o...  
3      His father has died, he hasn't spoken with his...  
4      Before he was known internationally as a marti...  
...                                                  ...  
54195  Cov

<hr>

<h2>*   Final DataFrame.</h2>



In [51]:
df

Unnamed: 0,ID,Description,Year
0,1,"L.R. Brane loves his life - his car, his apart...",1998
1,2,"Spain, March 1964: Quico is a very naughty chi...",1977
2,3,One year in the life of Albin and his family o...,2010
3,4,"His father has died, he hasn't spoken with his...",2015
4,5,Before he was known internationally as a marti...,1955
...,...,...,...
54195,54196,"Covering multiple genres, Tales of Light & Dar...",2013
54196,54197,As Alice and Cora Munro attempt to find their ...,1965
54197,54198,"A movie 169 years in the making. Oliver Twist,...",2007
54198,54199,"Popular, but mysterious rock D.J Mike Mallard ...",1973


<hr><hr>

<h1>Testing the Model.</h1>

In [53]:
X_test_tfidf = tfidf.transform(df['Description'])

<hr><hr>

<h1>Predict the Genres</h1>

In [55]:
predict_genres = model.predict(X_test_tfidf)

<hr><hr>

<h1>Adding <b>predictions</b> to the DataFrame.</h1>

In [57]:
df['Predicted Genre'] = predict_genres
df

Unnamed: 0,ID,Description,Year,Predicted Genre
0,1,"L.R. Brane loves his life - his car, his apart...",1998,comedy
1,2,"Spain, March 1964: Quico is a very naughty chi...",1977,drama
2,3,One year in the life of Albin and his family o...,2010,documentary
3,4,"His father has died, he hasn't spoken with his...",2015,drama
4,5,Before he was known internationally as a marti...,1955,drama
...,...,...,...,...
54195,54196,"Covering multiple genres, Tales of Light & Dar...",2013,drama
54196,54197,As Alice and Cora Munro attempt to find their ...,1965,action
54197,54198,"A movie 169 years in the making. Oliver Twist,...",2007,comedy
54198,54199,"Popular, but mysterious rock D.J Mike Mallard ...",1973,drama


In [61]:
df.to_csv(r'/content/drive/MyDrive/Predicted Genre.csv')

<hr>

<h1>Explanation: </h1>

<h1><ul>*   Text Preprocessing: The plot summaries are <b>preprocessed</b> and <b>transformed</b> using the same <b>TF-IDF vectorizer</b> fitted during the <b>training</b> phase.<br><br>
*   Genre Prediction: The <b>trained model</b> predicts genres for each plot summary.<br><br>
*   Result Storage: The predicted genres are stored in a new
column called <b>'Predicted Genre'</b>.</ul></h1>



<hr><hr>

  <a>
    <img src="https://github.com/user-attachments/assets/08ed2cbc-1be6-4690-a6c1-49a98a9787d6" width="600" height="300" />
  </a><br>