# TV Shows Recommendation Project 
#### by Cosine similarity (content-based)

#

## Data Preprocessing




#### Importing modules, reading data and deleting duplicates

In [99]:
import pandas as pd
import numpy as np

# Display every column when a frame is shown instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Read the raw TV dataset and discard rows that are exact duplicates.
data = pd.read_csv('tv.csv').drop_duplicates()


##### Drop rows with a missing name, overview, poster_path, first_air_date, last_air_date or origin_country

In [100]:
# Keep only the rows in which every one of these fields is present.
data = data.dropna(
    subset=[
        'name',
        'overview',
        'poster_path',
        'first_air_date',
        'last_air_date',
        'origin_country',
    ]
)

##### Keeping only non-adult content (rows where `adult` is False)

In [101]:

# Select the rows whose `adult` flag equals False.
data = data.loc[data['adult'].eq(False)]

##### Removing columns that will have no impact on the vectors

In [102]:

# These columns are not used when building the feature vectors, so drop them.
data = data.drop(
    columns=[
        'homepage',
        'networks',
        'status',
        'backdrop_path',
        'episode_run_time',
        'original_name',
        'number_of_seasons',
        'number_of_episodes',
        'in_production',
        'popularity',
    ]
)

#### Filling the columns that feed the vectors with an empty string instead of a null value

In [103]:
# Replace missing values in the text columns with "" so the later string
# operations can run on every row; columns not listed here are left untouched.
data = data.fillna(
    {
        column: ""
        for column in (
            'genres',
            'tagline',
            'languages',
            'created_by',
            'production_companies',
            'production_countries',
            'spoken_languages',
        )
    }
)


##### Checking whether any null values are still present

In [104]:
# Count the missing values per column; every count should be 0 at this point.
data.isna().sum()

id                      0
name                    0
original_language       0
vote_count              0
vote_average            0
overview                0
adult                   0
first_air_date          0
last_air_date           0
poster_path             0
type                    0
tagline                 0
genres                  0
created_by              0
languages               0
origin_country          0
spoken_languages        0
production_companies    0
production_countries    0
dtype: int64

#### Creating a copy of the top 15000 series by vote count

In [105]:
# Order the shows from most to fewest votes.
data_sorted = data.sort_values('vote_count', ascending=False)

# Keep the first 15000 rows as an independent copy so later column
# assignments do not write into a slice of `data_sorted`.
data = data_sorted.head(15000).copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15000 entries, 0 to 15580
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    15000 non-null  int64  
 1   name                  15000 non-null  object 
 2   original_language     15000 non-null  object 
 3   vote_count            15000 non-null  int64  
 4   vote_average          15000 non-null  float64
 5   overview              15000 non-null  object 
 6   adult                 15000 non-null  bool   
 7   first_air_date        15000 non-null  object 
 8   last_air_date         15000 non-null  object 
 9   poster_path           15000 non-null  object 
 10  type                  15000 non-null  object 
 11  tagline               15000 non-null  object 
 12  genres                15000 non-null  object 
 13  created_by            15000 non-null  object 
 14  languages             15000 non-null  object 
 15  origin_country        15

##### Converting sentences to list of words and removing special characters and spaces

In [106]:
# For each of these columns: remove every space, then split on ',' to get a
# list of tokens (for example "a b,c" becomes ["ab", "c"]).
# NOTE(review): for `tagline` this glues the whole phrase between commas into
# a single token - confirm that is intended.
for list_column in (
    "genres",
    "tagline",
    "production_companies",
    "production_countries",
    "spoken_languages",
    "languages",
    "created_by",
):
    data[list_column] = data[list_column].map(
        lambda text: text.replace(" ", "").split(",")
    )

# Overview: strip the listed punctuation marks, then split on whitespace.
data["overview"] = (
    data["overview"]
    .str.replace(r'[,.!?"]', '', regex=True)
    .map(str.split)
)

# Dates: keep only the first four characters (the year) as an integer.
for date_column in ("first_air_date", "last_air_date"):
    data[date_column] = data[date_column].map(lambda stamp: int(stamp[:4]))

#### Scaling Numerical values

In [107]:
# Using min-max scaling to scale between 0 and 1
def _min_max_scale(column):
    """Rescale a numeric Series linearly so its values lie between 0 and 1.

    Parameters:
        column: numeric pandas Series.

    Returns:
        A Series with the same index holding (value - min) / (max - min).
        When every value is identical the range is zero; in that case a
        Series of 0.0 is returned instead of dividing by zero, which would
        fill the column with NaN.
    """
    lowest = column.min()
    spread = column.max() - lowest
    if spread == 0:
        return pd.Series(0.0, index=column.index)
    return (column - lowest) / spread


# Using min-max scaling to scale between 0 and 1
for numeric_column in ('vote_average', 'vote_count', 'first_air_date', 'last_air_date'):
    data[numeric_column] = _min_max_scale(data[numeric_column])


#### Converting single-value columns to lists too

In [108]:
# Wrap each scalar value in a one-element list so these columns can be
# concatenated with the other list-valued columns.
for single_column in ("original_language", "type", "origin_country"):
    data[single_column] = data[single_column].map(lambda value: [value])

#### Creating a new Column 'tags'

In [109]:
# Concatenate the per-row token lists, left to right, into one list per show.
# NOTE(review): `languages` is tokenised earlier but is not part of this
# list - confirm whether it was left out on purpose.
combined_tags = data["overview"]
for part in (
    "genres",
    "tagline",
    "production_companies",
    "production_countries",
    "spoken_languages",
    "type",
    "original_language",
    "created_by",
    "origin_country",
):
    combined_tags = combined_tags + data[part]
data["tags"] = combined_tags

#### Dropping columns which were used in making tags column

In [110]:
# Remove the columns that are no longer needed once `tags` has been built.
data = data.drop(
    columns=[
        'overview',
        'adult',
        'type',
        'tagline',
        'genres',
        'created_by',
        'languages',
        'origin_country',
        'spoken_languages',
        'production_companies',
        'production_countries',
        'original_language',
    ]
)

#### Checking a sample TV show (the second-to-last row)

In [111]:
# Inspect one processed row: the second-to-last row by position (a fixed pick, not a random sample).
data.iloc[-2]

id                                                           137045
name                                                Tokyo 24th Ward
vote_count                                                      0.0
vote_average                                               0.419355
first_air_date                                             0.975904
last_air_date                                              0.971831
poster_path                        /3NYB8HfKALKQ4cHGx8Rx4Dhd0YE.jpg
tags              [The, Far, Eastern, Special, Administrative, R...
Name: 15626, dtype: object

#### Converting tags from list to string and converting it to all lower case

In [112]:
# Join each token list with single spaces and lower-case the resulting string.
data["tags"] = data["tags"].map(lambda tokens: " ".join(tokens).lower())

#### Using stemming to reduce words that share the same root to a single token
#### E.g. [loved, loves, loving] are all reduced to the stem "love"

In [113]:
import nltk
from nltk.stem.porter import PorterStemmer
# A single stemmer instance shared by every call to `stem`.
ps = PorterStemmer()


def stem(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [114]:
# Stem every word of every tag string.
data["tags"] = data["tags"].map(stem)

#### Vectorizing the tags with TF-IDF (Term Frequency - Inverse Document Frequency) to weight each word

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack

# Turn the stemmed tag strings into TF-IDF vectors, capped at 12000 features
# and with English stop words removed.
tfidf = TfidfVectorizer(max_features=12000, stop_words='english')
vec = tfidf.fit_transform(data['tags'])

# The scaled numeric columns that are appended as three extra feature columns.
# NOTE(review): `vote_count` is scaled earlier but is not included here -
# confirm whether that is intended.
addon = data[['first_air_date', 'last_air_date', 'vote_average']]

# Place the text and numeric features side by side and store the result as CSR.
combined_matrix = hstack([vec, addon]).tocsr()

In [116]:
# Expected shape: (number of shows, TF-IDF features + 3 numeric columns).
combined_matrix.shape

(15000, 12003)

### Compute the similarity matrix using cosine similarity

In [117]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows (shows) of combined_matrix.
# NOTE(review): with 15000 rows the result is 15000 x 15000; if it comes back
# as a dense float64 array that is roughly 1.8 GB - confirm it fits in memory.
sim = cosine_similarity(combined_matrix)

##### For each show, sorting its similarity row in descending order and writing the indexes of the 20 most similar TV shows to a CSV file

In [118]:
import csv

# Open a new CSV file in write mode with UTF-8 encoding
# Number of neighbours stored for each show.
top_k = 20

# Open a new CSV file in write mode with UTF-8 encoding.
# Row i of the file holds the positional indexes of the shows most similar
# to show i, best match first.
with open('sorted_lists_tv.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for show_index, similarities in enumerate(sim):
        # Stable sort on the negated scores gives descending similarity,
        # with ties kept in ascending index order.
        ranked = np.argsort(-np.asarray(similarities), kind='stable')

        # Exclude the show itself by index rather than by assuming it sorts
        # first: when another show has an identical (tied) score, the show's
        # own entry is not guaranteed to be at position 0.
        neighbours = ranked[ranked != show_index][:top_k]

        writer.writerow(neighbours.tolist())

##### Writing an index CSV so that id, name and poster can be looked up from the row indexes stored in the similarity CSV

In [119]:
# Save id, name and poster path in the current row order, without the
# DataFrame index, so that line i of this file describes positional index i.
data.loc[:, ['id', 'name', 'poster_path']].to_csv('tvindex.csv', header=True, index=False)