## Data Preprocessing

### Importing the dataset

In [21]:
import pandas as pd


# Step 1: Read the CSV file with appropriate parameters
df = pd.read_csv('../data/movie_dataset.csv')

dataset = df.head(500)
print(dataset)

summary_statictics = dataset.describe(include='all')

print(summary_statictics)

     index     budget                                    genres  \
0        0  237000000  Action Adventure Fantasy Science Fiction   
1        1  300000000                  Adventure Fantasy Action   
2        2  245000000                    Action Adventure Crime   
3        3  250000000               Action Crime Drama Thriller   
4        4  260000000          Action Adventure Science Fiction   
..     ...        ...                                       ...   
495    495   79000000          Adventure Action Science Fiction   
496    496   78000000                   Animation Family Comedy   
497    497   78000000                     Crime Thriller Horror   
498    498  100000000                         Western Adventure   
499    499   79000000                                    Comedy   

                                         homepage      id  \
0                     http://www.avatarmovie.com/   19995   
1    http://disney.go.com/disneypictures/pirates/     285   
2     http:/

## Visualising data

In [6]:
import matplotlib.pyplot as plt

df['release_date'] = pd.to_datetime(df['release_date'], format='%Y/%m/%d')
df['release_year'] = df['release_date'].dt.year

plt.figure(figsize=(12, 8))
plt.scatter(df['release_year'], df['vote_count'], color='blue', alpha=0.5)
plt.xlabel('Release Year')
plt.ylabel('Vote Average')
plt.title('Movie Budget vs Vote Average')
# plt.xscale('log')  # Optional: Use logarithmic scale for better visualization if there are large ranges
# plt.yscale('log')  # Optional: Use logarithmic scale for better visualization if there are large ranges
plt.tight_layout()
plt.show()

ValueError: time data "2009-12-10" doesn't match format "%Y/%m/%d", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

### Setting independent and dependent variables and removing irrelevant columns

In [41]:
X = dataset.drop(columns=['vote_average', 'budget', 'revenue'])
y = dataset['vote_average']

X['genres'] = X['genres'].str.split(' ')
print(X['genres'])

print(X['release_date'])

X['release_date'] = pd.to_datetime(X['release_date'], format='%Y-%m-%d')
X['release_date'] = X['release_date'].dt.year

print(X['release_date'])


0      [Action, Adventure, Fantasy, Science, Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4               [Action, Adventure, Science, Fiction]
                            ...                      
495             [Adventure, Action, Science, Fiction]
496                       [Animation, Family, Comedy]
497                         [Crime, Thriller, Horror]
498                              [Western, Adventure]
499                                          [Comedy]
Name: genres, Length: 500, dtype: object
0      2009-12-10
1      2007-05-19
2      2015-10-26
3      2012-07-16
4      2012-03-07
          ...    
495    2012-01-19
496    2013-09-26
497    2002-09-29
498    2004-03-05
499    2011-11-11
Name: release_date, Length: 500, dtype: object
0      2009
1      2007
2      2015
3      2012
4      2012
       ... 
495    2012
496    2013
497    2002
498    2004


### Filling in missing data

In [19]:
import numpy as np
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

numeric_columns = [3, 8, 12, 17]
# print(X[:, 17])

X = np.array(X)

for i in numeric_columns:
    X[:, i] = imputer.fit_transform(X[:, i].reshape(-1, 1)).flatten()

print(X)

# imputer.fit(X[:, numeric_columns])
# X[:, numeric_columns] = imputer.transform(X[:, numeric_columns])

[[0 list(['Action', 'Adventure', 'Fantasy', 'Science', 'Fiction'])
  'http://www.avatarmovie.com/' ...
  'Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez'
  '[{\'name\': \'Stephen E. Rivkin\', \'gender\': 0, \'department\': \'Editing\', \'job\': \'Editor\', \'credit_id\': \'52fe48009251416c750aca23\', \'id\': 1721}, {\'name\': \'Rick Carter\', \'gender\': 2, \'department\': \'Art\', \'job\': \'Production Design\', \'credit_id\': \'539c47ecc3a36810e3001f87\', \'id\': 496}, {\'name\': \'Christopher Boyes\', \'gender\': 0, \'department\': \'Sound\', \'job\': \'Sound Designer\', \'credit_id\': \'54491c89c3a3680fb4001cf7\', \'id\': 900}, {\'name\': \'Christopher Boyes\', \'gender\': 0, \'department\': \'Sound\', \'job\': \'Supervising Sound Editor\', \'credit_id\': \'54491cb70e0a267480001bd0\', \'id\': 900}, {\'name\': \'Mali Finn\', \'gender\': 1, \'department\': \'Production\', \'job\': \'Casting\', \'credit_id\': \'539c4a4cc3a36810c9002101\', \'id\': 1262}, {

### Encoding the data

#### Encoding the Independent Variable

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

# Fill NaN values in 'genres' column with an empty list
df['genres'] = df['genres'].apply(lambda x: x if isinstance(x, list) else [])


mlb = MultiLabelBinarizer()
encoded_genres = mlb.fit_transform(df['genres'])

encoded_genres_df = pd.DataFrame(encoded_genres, columns=mlb.classes_)


# encoding_indices = [2, 4, 6, 9, 10, 15, 17, 18, 19]
categorical_columns = [6, 10, 11, 15, 16]

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), categorical_columns)], remainder='passthrough')

X = ct.fit_transform(X)
print(X)

encoded_df = pd.DataFrame(X, columns=ct.get_feature_names_out())
encoded_df = pd.concat([encoded_df, encoded_genres_df], axis=1)
print(encoded_df)
summary_statistics = encoded_df.describe(include='all')
print(summary_statictics)




[[0.0 1.0 0.0 ...
  'Avatar Action Adventure Fantasy Science Fiction culture clash future space war space colony society'
  1775163.4086
  'action adventure fantasy science fiction culture clash future space war space colony society enter the world of pandora. avatar in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. sam worthington zoe saldana sigourney weaver stephen lang michelle rodriguez [{\'name\': \'stephen e. rivkin\', \'gender\': 0, \'department\': \'editing\', \'job\': \'editor\', \'credit_id\': \'52fe48009251416c750aca23\', \'id\': 1721}, {\'name\': \'rick carter\', \'gender\': 2, \'department\': \'art\', \'job\': \'production design\', \'credit_id\': \'539c47ecc3a36810e3001f87\', \'id\': 496}, {\'name\': \'christopher boyes\', \'gender\': 0, \'department\': \'sound\', \'job\': \'sound designer\', \'credit_id\': \'54491c89c3a3680fb4001cf7\', \'id\': 90

## Tokenising and normalising text data for vectoration

In [44]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


df['genres'] = df['genres'].apply(lambda x: x.lower().split(' ') if isinstance(x, str) else x)
df['keywords'] = df['keywords'].apply(lambda x: x.lower().split(' ') if isinstance(x, str) else x)


# Making a function for the cast which seperates two words into a list item

def split_into_pairs(text):
    if isinstance(text, str):
        # Use regular expression to find pairs of words
        return re.findall(r'\b\w+\s+\w+\b', text)
    return text

df['cast'] = df['cast'].apply(split_into_pairs)





df



Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,"[Action, Adventure, Fantasy, Science, Fiction]",http://www.avatarmovie.com/,19995,"[culture, clash, future, space, war, space, co...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...","[James, Cameron]"
1,1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug, abuse, exotic, island, east, ind...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...","[Gore, Verbinski]"
2,2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based, on, novel, secret, agent, sequel,...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, u00e9a Seydoux...","[{'name': 'Thomas Newman', 'gender': 2, 'depar...","[Sam, Mendes]"
3,3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc, comics, crime, fighter, terrorist, secret...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{'name': 'Hans Zimmer', 'gender': 2, 'departm...","[Christopher, Nolan]"
4,4,260000000,"[Action, Adventure, Science, Fiction]",http://movies.disney.com/john-carter,49529,"[based, on, novel, mars, medallion, space, tra...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{'name': 'Andrew Stanton', 'gender': 2, 'depa...","[Andrew, Stanton]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,4798,220000,"[Action, Crime, Thriller]",,9367,"[united, states\u2013mexico, barrier, legs, ar...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,"[Carlos Gallardo, Jaime de, Hoyos Peter, Marqu...","[{'name': 'Robert Rodriguez', 'gender': 0, 'de...","[Robert, Rodriguez]"
4799,4799,9000,"[Comedy, Romance]",,72766,,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,"[Edward Burns, Kerry Bish, u00e9 Marsha, Dietl...","[{'name': 'Edward Burns', 'gender': 2, 'depart...","[Edward, Burns]"
4800,4800,0,"[Comedy, Drama, Romance, TV, Movie]",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[date, love, at, first, sight, narration, inve...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,"[Eric Mabius, Kristin Booth, Crystal Lowe, Geo...","[{'name': 'Carla Hetland', 'gender': 0, 'depar...","[Scott, Smith]"
4801,4801,0,,http://shanghaicalling.com/,126186,,en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan...","[{'name': 'Daniel Hsia', 'gender': 2, 'departm...","[Daniel, Hsia]"
