In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import ast
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alici\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df_rating = pd.read_csv('../Data_Files/Raw_Data/users_ratings_img_20users_final.csv')
df_rating.head(5)

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,Color2,Color3,Brightness,Contrast,Saturation,Hue,Texture,Entropy,Noise,Colorfulness
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Not Rated,1 hr 29 min,...,"[197.87346221441118, 183.76742823667269, 147.5...","[132.63642065649162, 67.00731758310732, 66.696...",28.345112,1.0,0.483944,0.286751,0.095318,5.033725,2607.615256,0.151818
1,ur3223254,West Side Story,https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG-13,2 hr 36 min,...,"[192.77984157334174, 164.10516252390104, 156.2...","[26.031522737713317, 18.571141479099744, 27.59...",56.452657,1.0,0.387084,0.630909,0.117362,7.215298,2937.235783,0.105448
2,ur3223254,Supernova,https://m.media-amazon.com/images/M/MV5BNDc0Y2...,ur3223254_3.jpg,2020,Sam and Tusker are traveling across England in...,['Harry Macqueen'],"['Colin Firth', 'Stanley Tucci', 'Pippa Haywoo...",R,1 hr 35 min,...,"[237.4783657256323, 234.21676833593577, 225.24...","[192.77753168450687, 143.20118124769283, 122.0...",129.423365,1.0,0.336827,0.314486,0.064798,7.568517,5491.649326,0.239863
3,ur3223254,The Many Saints of Newark,https://m.media-amazon.com/images/M/MV5BYmQzNm...,ur3223254_4.jpg,2021,Witness the making of Tony Soprano. The story ...,['Alan Taylor'],"['Alessandro Nivola', 'Leslie Odom Jr.', 'Jon ...",R,2 hr,...,"[227.23344529750602, 227.37020153551038, 227.3...","[123.22488696677817, 87.70277177118118, 88.950...",59.251703,1.0,0.147507,0.146196,0.076623,6.669963,6147.526922,0.11634
4,ur3223254,Never Look Away,https://m.media-amazon.com/images/M/MV5BNGUwYT...,ur3223254_5.jpg,2018,German artist Kurt Barnert has escaped East Ge...,['Florian Henckel von Donnersmarck'],"['Tom Schilling', 'Sebastian Koch', 'Paula Bee...",R,3 hr 9 min,...,"[215.23204559848062, 185.4957568081067, 188.98...","[129.6641665391094, 101.77100872493492, 107.88...",95.750548,1.0,0.333741,0.614238,0.074675,7.714849,4981.923137,0.151493


In [3]:
df_preference = pd.read_csv('../Data_Files/Raw_Data/user_movie_preferences_20users.csv')
df_preference.head(5)

Unnamed: 0,UserID,Top 3 Genres,Top 3 Years
0,ur117926588,"['Short', 'Reality-TV', 'Animation']","['2023', '1964', '1968']"
1,ur15298231,"['Film-Noir', 'Western', 'Documentary']","['1951', '1946', '1948']"
2,ur1994077,"['Western', 'Film-Noir', 'Short']","['1982', '1974', '1967']"
3,ur17646017,"['Mystery', 'Crime', 'Drama']","['2016', '2017', '2015']"
4,ur4532636,"['Documentary', 'Western', 'Film-Noir']","['1962', '1993', '1928']"


In [4]:
nan_counts = df_rating.isna().sum()
print(nan_counts)

UserID                0
Title                 0
Img_Path              0
Img_File_Name         0
Year                  0
Description         114
Directors          1189
Stars               112
Viewer_Advisory    1014
Duration            245
Genre                 4
Votes                 3
Movie_Rating          0
User_Rating           0
Color1                0
Color2                0
Color3                0
Brightness            0
Contrast              0
Saturation            0
Hue                   0
Texture               0
Entropy               0
Noise                 0
Colorfulness          0
dtype: int64


# Data Cleaning

Add Years to movie title (For duplicated userid and titles row since movies can be released on different years)

In [5]:
# identify duplicate rows based on 'userid' and 'title'
duplicate_mask = df_rating.duplicated(subset=['UserID', 'Title'], keep=False)

# apply function to add year of release to the title for each duplicate row
df_rating.loc[duplicate_mask, 'Title'] = df_rating[duplicate_mask].apply(lambda x: f"{x['Title']} ({x['Year']})", axis=1)
df_rating.drop_duplicates(['UserID', 'Title'], keep=False, inplace=True)


In [6]:
df = pd.merge(df_rating, df_preference, on = 'UserID', how = 'left')
df.head(5)

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,Brightness,Contrast,Saturation,Hue,Texture,Entropy,Noise,Colorfulness,Top 3 Genres,Top 3 Years
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Not Rated,1 hr 29 min,...,28.345112,1.0,0.483944,0.286751,0.095318,5.033725,2607.615256,0.151818,"['Documentary', 'War', 'History']","['1998', '1997', '1999']"
1,ur3223254,West Side Story (2021),https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG-13,2 hr 36 min,...,56.452657,1.0,0.387084,0.630909,0.117362,7.215298,2937.235783,0.105448,"['Documentary', 'War', 'History']","['1998', '1997', '1999']"
2,ur3223254,Supernova,https://m.media-amazon.com/images/M/MV5BNDc0Y2...,ur3223254_3.jpg,2020,Sam and Tusker are traveling across England in...,['Harry Macqueen'],"['Colin Firth', 'Stanley Tucci', 'Pippa Haywoo...",R,1 hr 35 min,...,129.423365,1.0,0.336827,0.314486,0.064798,7.568517,5491.649326,0.239863,"['Documentary', 'War', 'History']","['1998', '1997', '1999']"
3,ur3223254,The Many Saints of Newark,https://m.media-amazon.com/images/M/MV5BYmQzNm...,ur3223254_4.jpg,2021,Witness the making of Tony Soprano. The story ...,['Alan Taylor'],"['Alessandro Nivola', 'Leslie Odom Jr.', 'Jon ...",R,2 hr,...,59.251703,1.0,0.147507,0.146196,0.076623,6.669963,6147.526922,0.11634,"['Documentary', 'War', 'History']","['1998', '1997', '1999']"
4,ur3223254,Never Look Away,https://m.media-amazon.com/images/M/MV5BNGUwYT...,ur3223254_5.jpg,2018,German artist Kurt Barnert has escaped East Ge...,['Florian Henckel von Donnersmarck'],"['Tom Schilling', 'Sebastian Koch', 'Paula Bee...",R,3 hr 9 min,...,95.750548,1.0,0.333741,0.614238,0.074675,7.714849,4981.923137,0.151493,"['Documentary', 'War', 'History']","['1998', '1997', '1999']"


# Feature Engineering

#### 1 if movie genre is user's top 3 preference else 0

In [7]:
def check_pref_genre(row):
    if isinstance(row['genre_list'], list):
        for ele in row['genre_list']:
            if ele in row['Top 3 Genres']:
                return 1
        return 0
    else:
        return 0
        

df['genre_list'] = df['Genre'].apply(lambda x: x.strip().replace(" ",  "").split(",")  if isinstance(x, str)  else x)
df['is_top3_genre'] = df.apply(check_pref_genre, axis = 1)
df['is_top3_year'] = df.apply(lambda x: 1 if str(x['Year'][:4]) in x['Top 3 Years'] else 0, axis = 1)

In [8]:
df.head(3)

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,Hue,Texture,Entropy,Noise,Colorfulness,Top 3 Genres,Top 3 Years,genre_list,is_top3_genre,is_top3_year
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Not Rated,1 hr 29 min,...,0.286751,0.095318,5.033725,2607.615256,0.151818,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Documentary, Music]",1,0
1,ur3223254,West Side Story (2021),https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG-13,2 hr 36 min,...,0.630909,0.117362,7.215298,2937.235783,0.105448,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Crime, Drama, Musical]",0,0
2,ur3223254,Supernova,https://m.media-amazon.com/images/M/MV5BNDc0Y2...,ur3223254_3.jpg,2020,Sam and Tusker are traveling across England in...,['Harry Macqueen'],"['Colin Firth', 'Stanley Tucci', 'Pippa Haywoo...",R,1 hr 35 min,...,0.314486,0.064798,7.568517,5491.649326,0.239863,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Drama, Romance]",0,0


#### Get number of years from now using Year column 

In [9]:
df['Year'].unique()

array(['1985', '2021', '2020', '2018', '2007', '2019', '2017', '2016',
       '1998', '2001', '1972', '1988', '2014', '1997', '2003', '1961',
       '1987', '2012–2019', '2017–2018', '2013', '2015', '2002', '1996',
       '2016–2018', '2009', '2017–2019', '2010–2017', '2016–2017', '2012',
       '2015–2018', '2011', '2014–2019', '1989', '1992', '2010',
       '2014–2016', '2014–2017', '2000', '2006', '1966', '2013–2015',
       '2013–2016', '1986', '1995', '2008', '2004', '2012–2018',
       '2011–2012', '1971', '2012–2014', '1993', '1994', '2011–2019',
       '2011–2013', '1999', '2010–2015', '2005', '1983', '2008–2015',
       '1990', '2010–2013', '2007–2010', '1974', '1985–1998', '2001–2003',
       '1976', '1994–1995', '1999–2002', '1978', '1991', '2004–2007',
       '1984', '1981', '1993–', '2005–2007', '1973', '1975', '1997–',
       '1965', '2022', '2023', '1944', '1945', '1941', '1942', '1939',
       '1938', '1962', '1968', '1980', '1963', '1982', '1967', '1931',
       '1979'

In [10]:
df['num_years_released'] = df['Year'].apply(lambda x: 2023 - int(x) if len(x.split('–')) == 0 else 2023 - int(x.split('–')[0]))
df.head(5)

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,Texture,Entropy,Noise,Colorfulness,Top 3 Genres,Top 3 Years,genre_list,is_top3_genre,is_top3_year,num_years_released
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Not Rated,1 hr 29 min,...,0.095318,5.033725,2607.615256,0.151818,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Documentary, Music]",1,0,38
1,ur3223254,West Side Story (2021),https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG-13,2 hr 36 min,...,0.117362,7.215298,2937.235783,0.105448,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Crime, Drama, Musical]",0,0,2
2,ur3223254,Supernova,https://m.media-amazon.com/images/M/MV5BNDc0Y2...,ur3223254_3.jpg,2020,Sam and Tusker are traveling across England in...,['Harry Macqueen'],"['Colin Firth', 'Stanley Tucci', 'Pippa Haywoo...",R,1 hr 35 min,...,0.064798,7.568517,5491.649326,0.239863,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Drama, Romance]",0,0,3
3,ur3223254,The Many Saints of Newark,https://m.media-amazon.com/images/M/MV5BYmQzNm...,ur3223254_4.jpg,2021,Witness the making of Tony Soprano. The story ...,['Alan Taylor'],"['Alessandro Nivola', 'Leslie Odom Jr.', 'Jon ...",R,2 hr,...,0.076623,6.669963,6147.526922,0.11634,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Crime, Drama]",0,0,2
4,ur3223254,Never Look Away,https://m.media-amazon.com/images/M/MV5BNGUwYT...,ur3223254_5.jpg,2018,German artist Kurt Barnert has escaped East Ge...,['Florian Henckel von Donnersmarck'],"['Tom Schilling', 'Sebastian Koch', 'Paula Bee...",R,3 hr 9 min,...,0.074675,7.714849,4981.923137,0.151493,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Biography, Drama, Romance]",0,0,5


#### Cleaned up and merged similar viewer advisory

In [11]:
df['Viewer_Advisory'].unique()

array(['Not Rated', 'PG-13', 'R', 'TV-MA', 'PG', 'TV-14', 'TV-PG',
       'Approved', 'M18', nan, 'Unrated', 'PG13', 'R21', 'TV-G', 'NC-17',
       'NC-16', 'G', '(Banned)', 'R(A)', 'Passed', 'NC16', 'GP', 'TV-Y7',
       'TV-Y', 'M/PG', '16+', 'MA-17', 'M', '12', 'X', 'T', 'TV-Y7-FV',
       'E', '13+', 'K-A', 'E10+', 'AO'], dtype=object)

In [12]:
len(df['Viewer_Advisory'].unique())

37

In [13]:
def merge_viewer(x):
    gp = ['G', 'GP', 'TV-G', 'E', 'K-A']
    pg = ['PG-13', 'PG','PG13', 'TV-PG', '13+', 'M/PG', 'T', '12', 'TV-14']
    children = ['TV-Y', 'TV-Y7', 'TV-Y7-FV', 'E10+']
    nc = ['NC-17', 'MA-17', 'NC16', 'NC-16', '17+', '16+', 'X']
    m = ['M', "M-18", 'TV-MA', 'M18', 'R(A)', 'AO']
    r =['R', 'R21']
    unrated = ['Unrated', 'Not Rated', 'Approved', 'Passed']
    
    if x in pg:
        return "PG"
    elif x in gp:
        return 'GP'
    elif x in children:
        return 'Children'
    elif x in nc:
        return "NC-17"
    elif x in m:
        return "M"
    elif x in r:
        return "R21"
    elif x in unrated:
        return "Unrated"
    else:
        return x
    
df['Viewer_Advisory'] = df['Viewer_Advisory'].apply(merge_viewer)

In [14]:
print(df['Viewer_Advisory'].unique())
len(df['Viewer_Advisory'].unique())

['Unrated' 'PG' 'R21' 'M' nan 'GP' 'NC-17' '(Banned)' 'Children']


9

Number of categories for viewer advisory has been reduced from 37 to 8 (exclude nan).

#### Convert movie duration to minutes

In [15]:
def convert_to_mins(x):
    if not isinstance(x, str):
        return x
    elif len(x.split(" "))>2:
        return 60*int(x.split(" ")[0]) + int(x.split(" ")[2])
    else:
        return int(x.split(" ")[0])
                   
df['duration_mins'] = df['Duration'].apply(convert_to_mins)
df.head(5)             

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,Entropy,Noise,Colorfulness,Top 3 Genres,Top 3 Years,genre_list,is_top3_genre,is_top3_year,num_years_released,duration_mins
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Unrated,1 hr 29 min,...,5.033725,2607.615256,0.151818,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Documentary, Music]",1,0,38,89.0
1,ur3223254,West Side Story (2021),https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG,2 hr 36 min,...,7.215298,2937.235783,0.105448,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Crime, Drama, Musical]",0,0,2,156.0
2,ur3223254,Supernova,https://m.media-amazon.com/images/M/MV5BNDc0Y2...,ur3223254_3.jpg,2020,Sam and Tusker are traveling across England in...,['Harry Macqueen'],"['Colin Firth', 'Stanley Tucci', 'Pippa Haywoo...",R21,1 hr 35 min,...,7.568517,5491.649326,0.239863,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Drama, Romance]",0,0,3,95.0
3,ur3223254,The Many Saints of Newark,https://m.media-amazon.com/images/M/MV5BYmQzNm...,ur3223254_4.jpg,2021,Witness the making of Tony Soprano. The story ...,['Alan Taylor'],"['Alessandro Nivola', 'Leslie Odom Jr.', 'Jon ...",R21,2 hr,...,6.669963,6147.526922,0.11634,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Crime, Drama]",0,0,2,2.0
4,ur3223254,Never Look Away,https://m.media-amazon.com/images/M/MV5BNGUwYT...,ur3223254_5.jpg,2018,German artist Kurt Barnert has escaped East Ge...,['Florian Henckel von Donnersmarck'],"['Tom Schilling', 'Sebastian Koch', 'Paula Bee...",R21,3 hr 9 min,...,7.714849,4981.923137,0.151493,"['Documentary', 'War', 'History']","['1998', '1997', '1999']","[Biography, Drama, Romance]",0,0,5,189.0


#### Merge Similar Genre

In [16]:
unique_genres = set()
for index, row in df.iterrows():
    if isinstance(row['genre_list'], list):
        for genres in row["genre_list"]:
            unique_genres.add(genres)

unique_genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [17]:
len(unique_genres)

28

In [18]:
def merge_genres(x):
    scifi_fan = [ 'Sci-Fi', 'Fantasy']
    reality = ['Documentary', 'Reality-TV', 'News', 'Talk-Show', 'Game-Show']
    horror_thriller = ['Horror', "Thriller"]
    crime_mystery_noir = ['Crime', 'Mystery', 'Film-Noir']
    music = ['Music', 'Musical']
    sport_action_ad = ['Sport', 'Action', 'Adventure']
    war_hist_bio = ['War', "History", 'Biography']
    romcom = ['Romance', 'Comedy']

    new_genre_list = []

    if isinstance(x, list):
        for g in x:
            if g in scifi_fan and g not in new_genre_list:
                new_genre_list.append('Ad/SciFi/Fantasy')
            elif g in reality and g not in new_genre_list:
                new_genre_list.append('Reality')
            elif g in horror_thriller and g not in new_genre_list:
                new_genre_list.append('Horror/Thriller')
            elif g in crime_mystery_noir and g not in new_genre_list:
                new_genre_list.append('Crime/Mystery/Film-Noir')
            elif g in music and g not in new_genre_list:
                new_genre_list.append('Music')
            elif g in sport_action_ad and g not in new_genre_list:
                new_genre_list.append('Sport/Action/Adventure')
            elif g in war_hist_bio and g not in new_genre_list:
                new_genre_list.append('War/History/Biography')
            elif g in romcom and g not in new_genre_list:
                new_genre_list.append('RomCom')
            else:
                new_genre_list.append(g)

    return new_genre_list

df['genre_list'] = df['genre_list'].apply(merge_genres)
df['genre_list']
    

0                                         [Reality, Music]
1                  [Crime/Mystery/Film-Noir, Drama, Music]
2                                          [Drama, RomCom]
3                         [Crime/Mystery/Film-Noir, Drama]
4                   [War/History/Biography, Drama, RomCom]
                               ...                        
37793           [Sport/Action/Adventure, Ad/SciFi/Fantasy]
37794    [Sport/Action/Adventure, Crime/Mystery/Film-No...
37795    [Drama, Ad/SciFi/Fantasy, Crime/Mystery/Film-N...
37796    [Sport/Action/Adventure, Sport/Action/Adventur...
37797                                  [Animation, RomCom]
Name: genre_list, Length: 37798, dtype: object

In [19]:
unique_genres = set()
for index, row in df.iterrows():
    if isinstance(row['genre_list'], list):
        for genres in row["genre_list"]:
            unique_genres.add(genres)

unique_genres

{'Ad/SciFi/Fantasy',
 'Adult',
 'Animation',
 'Crime/Mystery/Film-Noir',
 'Drama',
 'Family',
 'Horror/Thriller',
 'Music',
 'Reality',
 'RomCom',
 'Short',
 'Sport/Action/Adventure',
 'War/History/Biography',
 'Western'}

In [20]:
len(unique_genres)

14

Number of genres have been reduced from 28 to 14

# Label Encoding and One Hot Encoding

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
res1 = pd.DataFrame(mlb.fit_transform(df['genre_list']),
                   columns=mlb.classes_,
                   index=df.index)

one_hot_viewer = pd.get_dummies(df['Viewer_Advisory'], prefix='viewer_')

df = pd.concat([df, res1, one_hot_viewer], axis = 1)

In [22]:
df.head(2)

Unnamed: 0,UserID,Title,Img_Path,Img_File_Name,Year,Description,Directors,Stars,Viewer_Advisory,Duration,...,War/History/Biography,Western,viewer__(Banned),viewer__Children,viewer__GP,viewer__M,viewer__NC-17,viewer__PG,viewer__R21,viewer__Unrated
0,ur3223254,The Making of 'West Side Story',https://m.media-amazon.com/images/M/MV5BOTU3Mj...,ur3223254_1.jpg,1985,"A documentary which shows, in great detail, th...",['Christopher Swann'],"['Leonard Bernstein', 'Kiri Te Kanawa', 'José ...",Unrated,1 hr 29 min,...,0,0,0,0,0,0,0,0,0,1
1,ur3223254,West Side Story (2021),https://m.media-amazon.com/images/M/MV5BMzQ5ZD...,ur3223254_2.jpg,2021,"An adaptation of the 1957 musical, West Side S...",['Steven Spielberg'],"['Ansel Elgort', 'Rachel Zegler', 'Ariana DeBo...",PG,2 hr 36 min,...,0,0,0,0,0,0,0,1,0,0


In [23]:
df.drop(columns= ['Img_Path', 'Img_File_Name','Description', 'Year', 'Directors', 'Stars', 'Color1', 'Color2', 'Color3','Viewer_Advisory', 'Duration', 'Genre', 'genre_list', 'Top 3 Years', 'Top 3 Genres'], inplace=True)
df.head(1)

Unnamed: 0,UserID,Title,Votes,Movie_Rating,User_Rating,Brightness,Contrast,Saturation,Hue,Texture,...,War/History/Biography,Western,viewer__(Banned),viewer__Children,viewer__GP,viewer__M,viewer__NC-17,viewer__PG,viewer__R21,viewer__Unrated
0,ur3223254,The Making of 'West Side Story',186.0,8.1,10,28.345112,1.0,0.483944,0.286751,0.095318,...,0,0,0,0,0,0,0,0,0,1


In [26]:
len(df.columns)

39

In [27]:
df.to_csv('numerical_features_cleaned.csv', index= False)