In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import ast
from nltk.stem.porter import PorterStemmer
import joblib
df = pd.read_csv('netflix_titles.csv')


In [None]:
for column in df.columns:
    mode = df[column].mode()
    if not mode.empty:
        df[column].fillna(mode[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode[0], inplace=True)


In [None]:
df = df[['show_id','type', 'title', 'director', 'country','cast', 'listed_in','description']]

In [None]:
df.head()

Unnamed: 0,show_id,type,title,director,country,cast,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,David Attenborough,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Rajiv Chilaka,South Africa,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,United States,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...","Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Rajiv Chilaka,United States,David Attenborough,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Rajiv Chilaka,India,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...","International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
df['description']=df['description'].apply(lambda x:x.split())

In [None]:
df['listed_in']=df['listed_in'].apply(lambda x:x.split(','))

In [None]:
df['cast']=df['cast'].apply(lambda x:x.split(','))

In [None]:
df['director'] = df['director'].str.replace(',', ', ', regex=False).str.replace('\s+', ' ', regex=True).str.strip()

In [None]:
df['director']=df['director'].apply(lambda x:[i.replace(" ","") for i in x])
df['listed_in']=df['listed_in'].apply(lambda x:[i.replace(" ","") for i in x])
df['cast']=df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
df['description']=df['description'].apply(lambda x:[i.replace(" ","") for i in x])
df['country']=df['country'].apply(lambda x:[i.replace(" ","") for i in x])
df['type']=df['type'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# prompt: df['tags']= df['discription']+df['listed_in']+df['director']+df['cast']+df['country']

df['tags'] = df.apply(
    lambda row: (row['description'] if isinstance(row['description'], list) else []) +
                (row['listed_in'] if isinstance(row['listed_in'], list) else []) +
                (row['director'] if isinstance(row['director'], list) else []) +
                (row['cast'] if isinstance(row['cast'], list) else []) +
                (row['country'] if isinstance(row['country'], list) else []) +
                (row['type'] if isinstance(row['type'], list) else []),
    axis=1
)

In [None]:
new_df = df[['show_id','title','tags']]

In [None]:
new_df['tags'] = new_df['tags'].apply(
    lambda x: " ".join([str(item) for item in x if item is not None and str(item).strip()])
    if isinstance(x, list) else ""
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(


In [None]:
new_df.head()

Unnamed: 0,show_id,title,tags
0,s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,s2,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,s3,Ganglands,To protect his family from a powerful drug lor...
3,s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
4,s5,Kota Factory,In a city of coaching centers known to train I...


In [None]:
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [None]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombies', 'zone', 'álvarocervantes'],
      dtype=object)

In [None]:
similarity=cosine_similarity(vectors)

In [None]:
sorted(list(enumerate(similarity[0])),reverse=True , key=lambda x:x[1])[1:6]

[(5797, np.float64(0.38575837490522985)),
 (8746, np.float64(0.3481553119113957)),
 (1171, np.float64(0.3202563076101743)),
 (5233, np.float64(0.3202563076101743)),
 (5239, np.float64(0.3202563076101743))]

In [None]:
def recommend(movie):
    movies_index = new_df[new_df['title']==movie].index[0]
    distance = similarity[movies_index]
    movies_list = sorted(list(enumerate(distance)),reverse=True , key=lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [None]:
recommend('Extremis')

Dick Johnson Is Dead
Winnie
David Attenborough: A Life on Our Planet
End Game
Dream Boat


In [None]:
joblib.dump(new_df, "movies.joblib")
joblib.dump(similarity,'similarity.joblib')

['similarity.joblib']