# CA 4

### The purpose of this project is to learn machine learning methods with the Scikit-Learn library.

# Imports

In [1]:
import numpy as np
import pandas as pd
import copy as cp
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Body

# Phase 0 : Data Analysis

In [2]:
# Load the raw dataset from "dataset.csv" (relative path, resolved against the
# notebook's working directory) into the working frame `df`.
df = pd.read_csv("dataset.csv")

In [3]:
# Display the freshly loaded frame to confirm the read succeeded.
df

Unnamed: 0,type,title,cast,country,release_year,listed_in,description
0,Movie,Dick Johnson Is Dead,,United States,2020,Documentaries,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,,,2021,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...
11054,TV Show,X-Men: Evolution,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,2000,"Action-Adventure, Animation, Kids",X-Men: Evolution features the team as teenager...
11055,TV Show,Smart Guy,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,1996,"Comedy, Coming of Age, Kids",A genius tries to fit in as a high school soph...
11056,TV Show,Disney Kirby Buckets,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,2014,"Action-Adventure, Comedy, Coming of Age",Welcome to Kirby's world! It's rude and sketchy.
11057,TV Show,Disney Mech-X4,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,2016,"Action-Adventure, Comedy, Science Fiction",Ryan discovers his ability to control a giant ...


### As seen above, the dataset was loaded successfully.

## 1. Dataset information:

In [4]:
# Print pandas' descriptive statistics for every column, one section per
# column, each section closed by a 50-character banner.
for title in df.columns:
    column_summary = df[title].describe()
    print("In", title, "column :")
    print(column_summary)
    print("*" * 50)

In type column :
count     11059
unique        2
top       Movie
freq       6131
Name: type, dtype: object
**************************************************
In title column :
count              11059
unique             10957
top       Sister, Sister
freq                   2
Name: title, dtype: object
**************************************************
In cast column :
count                   9694
unique                  9307
top       David Attenborough
freq                      20
Name: cast, dtype: object
**************************************************
In country column :
count              8364
unique              760
top       United States
freq               3105
Name: country, dtype: object
**************************************************
In release_year column :
count    11059.000000
mean      2014.209603
std          8.959517
min       1925.000000
25%       2013.000000
50%       2017.000000
75%       2019.000000
max       2021.000000
Name: release_year, dtype: float64
****

In [5]:
# Print pandas' structural summary of the frame (per-column non-null counts
# and dtypes), which shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11059 entries, 0 to 11058
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          11059 non-null  object
 1   title         11059 non-null  object
 2   cast          9694 non-null   object
 3   country       8364 non-null   object
 4   release_year  11059 non-null  int64 
 5   listed_in     11059 non-null  object
 6   description   11059 non-null  object
dtypes: int64(1), object(6)
memory usage: 604.9+ KB


## 2. percentage of data lost:

In [6]:
# Report, per column, the percentage of rows whose value is missing.
row_total = len(df)
for title in df.columns:
    missing_rows = df[title].isnull().sum()
    percentage = (missing_rows / row_total) * 100
    print("Percentage of data lost in", title, "column :")
    print(percentage)
    print("*" * 50)

Percentage of data lost in type column :
0.0
**************************************************
Percentage of data lost in title column :
0.0
**************************************************
Percentage of data lost in cast column :
12.342888145401936
**************************************************
Percentage of data lost in country column :
24.36929197938331
**************************************************
Percentage of data lost in release_year column :
0.0
**************************************************
Percentage of data lost in listed_in column :
0.0
**************************************************
Percentage of data lost in description column :
0.0
**************************************************


# Phase 1 : Preprocess

## 1. Handle Null Data:

### First method: drop rows with at least one NaN value.

### Second method: fill NaN values with their column's mode.

### We chose the second method because we want to keep the other information in rows that have at least one NaN value, and we do not want to shrink our dataset.

In [7]:
# Impute missing values column by column with that column's most frequent
# value (the first entry returned by `mode()`).
# NOTE(review): for free-text columns the mode is one specific real value. The
# recorded outputs show missing `cast` entries becoming "David Attenborough"
# and missing `country` entries becoming "United States", so imputed rows
# cannot be told apart from genuine ones afterwards.
column_modes = {name: df[name].mode()[0] for name in df}
for title, most_frequent in column_modes.items():
    df[title] = df[title].fillna(most_frequent)

In [8]:
# Re-check the share of missing values per column after imputation; every
# column is expected to report 0.0 now.
for title in df.columns:
    percentage = df[title].isnull().mean() * 100
    print("Percentage of data lost in", title, "column :")
    print(percentage)
    print("*" * 50)

Percentage of data lost in type column :
0.0
**************************************************
Percentage of data lost in title column :
0.0
**************************************************
Percentage of data lost in cast column :
0.0
**************************************************
Percentage of data lost in country column :
0.0
**************************************************
Percentage of data lost in release_year column :
0.0
**************************************************
Percentage of data lost in listed_in column :
0.0
**************************************************
Percentage of data lost in description column :
0.0
**************************************************


### As seen above, the missing data was successfully filled with the mode of each column.

## 2. Normalization VS Standardization:

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.


Normalization is good to use when you know that the distribution of your data does not follow a Gaussian distribution. This can be useful in algorithms that do not assume any distribution of the data like K-Nearest Neighbors and Neural Networks.
Standardization, on the other hand, can be helpful in cases where the data follows a Gaussian distribution. However, this does not have to be necessarily true. Also, unlike normalization, standardization does not have a bounding range. So, even if you have outliers in your data, they will not be affected by standardization.


Standardization comes into picture when features of input data set have large differences between their ranges, or simply when they are measured in different measurement units (e.g., Pounds, Meters, Miles … etc).

These differences in the ranges of initial features causes trouble to many machine learning models. For example, for the models that are based on distance computation, if one of the features has a broad range of values, the distance will be governed by this particular feature.

resources: https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/
https://builtin.com/data-science/when-and-why-standardize-your-data

### In this project, the choice between Normalization and Standardization does not affect the result, because we use tree-based algorithms (Decision Tree and Random Forest). We therefore simply pick one: Standardization (z-score), which is what the cell below applies to release_year.

In [9]:
# Show release_year before scaling (integer years).
df["release_year"]

0        2020
1        2021
2        2021
3        2021
4        2021
         ... 
11054    2000
11055    1996
11056    2014
11057    2016
11058    2008
Name: release_year, Length: 11059, dtype: int64

In [10]:
# Standardize release_year (z-score): subtract the column mean, then divide by
# the column standard deviation as computed by pandas' `std()`.
df["release_year"] = (
    df["release_year"]
    .sub(df["release_year"].mean())
    .div(df["release_year"].std())
)

In [11]:
# Show release_year after scaling (now float z-scores).
df["release_year"]

0        0.646285
1        0.757898
2        0.757898
3        0.757898
4        0.757898
           ...   
11054   -1.585979
11055   -2.032431
11056   -0.023394
11057    0.199832
11058   -0.693073
Name: release_year, Length: 11059, dtype: float64

### Standardization (z-score): subtract the mean and divide by the standard deviation

## 4. Handle listed_in values

### To handle this column, we can either separate its values or concatenate them.

### We separate the listed_in values into new columns and then remove the listed_in column, because the genres are different and must be considered separately.

In [12]:
# Display the frame after imputation and scaling, before listed_in is split.
df

Unnamed: 0,type,title,cast,country,release_year,listed_in,description
0,Movie,Dick Johnson Is Dead,David Attenborough,United States,0.646285,Documentaries,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,0.757898,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,0.757898,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,David Attenborough,United States,0.757898,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,0.757898,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...
11054,TV Show,X-Men: Evolution,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,-1.585979,"Action-Adventure, Animation, Kids",X-Men: Evolution features the team as teenager...
11055,TV Show,Smart Guy,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,-2.032431,"Comedy, Coming of Age, Kids",A genius tries to fit in as a high school soph...
11056,TV Show,Disney Kirby Buckets,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,-0.023394,"Action-Adventure, Comedy, Coming of Age",Welcome to Kirby's world! It's rude and sketchy.
11057,TV Show,Disney Mech-X4,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,0.199832,"Action-Adventure, Comedy, Science Fiction",Ryan discovers his ability to control a giant ...


In [13]:
# Largest number of ", "-separated genres found in any listed_in entry
# (0 when the column is empty); this is how many genre columns are needed.
max_num_of_values = max(
    (len(genres.split(", ")) for genres in df["listed_in"]),
    default=0,
)

In [14]:
# Report how many genre columns the split below will create.
print("Number of columns we need to separate the listed_in columns :", max_num_of_values)

Number of columns we need to separate the listed_in columns : 5


In [15]:
# Split every listed_in entry once, then build genre0 .. genre{N-1}.
# Rows with fewer than N genres are padded by cycling through their own genres
# again; `i % len(parts)` equals `i` whenever the i-th genre exists, so a
# single expression covers both the direct and the wrap-around case.
split_genres = [entry.split(", ") for entry in df["listed_in"]]
for i in range(max_num_of_values):
    df["genre" + str(i)] = [parts[i % len(parts)] for parts in split_genres]

In [16]:
# Display the frame with the new genre0..genre4 columns alongside listed_in.
df

Unnamed: 0,type,title,cast,country,release_year,listed_in,description,genre0,genre1,genre2,genre3,genre4
0,Movie,Dick Johnson Is Dead,David Attenborough,United States,0.646285,Documentaries,"As her father nears the end of his life, filmm...",Documentaries,Documentaries,Documentaries,Documentaries,Documentaries
1,TV Show,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,0.757898,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas
2,TV Show,Ganglands,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,0.757898,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows
3,TV Show,Jailbirds New Orleans,David Attenborough,United States,0.757898,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Docuseries,Reality TV,Docuseries,Reality TV,Docuseries
4,TV Show,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,0.757898,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows
...,...,...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,X-Men: Evolution,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,-1.585979,"Action-Adventure, Animation, Kids",X-Men: Evolution features the team as teenager...,Action-Adventure,Animation,Kids,Action-Adventure,Animation
11055,TV Show,Smart Guy,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,-2.032431,"Comedy, Coming of Age, Kids",A genius tries to fit in as a high school soph...,Comedy,Coming of Age,Kids,Comedy,Coming of Age
11056,TV Show,Disney Kirby Buckets,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,-0.023394,"Action-Adventure, Comedy, Coming of Age",Welcome to Kirby's world! It's rude and sketchy.,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy
11057,TV Show,Disney Mech-X4,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,0.199832,"Action-Adventure, Comedy, Science Fiction",Ryan discovers his ability to control a giant ...,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy


### delete listed_in column:

In [17]:
# listed_in is now fully represented by the genre columns; remove it in place.
df.drop(columns="listed_in", inplace=True)

In [18]:
# Display the frame after listed_in has been removed.
df

Unnamed: 0,type,title,cast,country,release_year,description,genre0,genre1,genre2,genre3,genre4
0,Movie,Dick Johnson Is Dead,David Attenborough,United States,0.646285,"As her father nears the end of his life, filmm...",Documentaries,Documentaries,Documentaries,Documentaries,Documentaries
1,TV Show,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,0.757898,"After crossing paths at a party, a Cape Town t...",International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas
2,TV Show,Ganglands,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,0.757898,To protect his family from a powerful drug lor...,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows
3,TV Show,Jailbirds New Orleans,David Attenborough,United States,0.757898,"Feuds, flirtations and toilet talk go down amo...",Docuseries,Reality TV,Docuseries,Reality TV,Docuseries
4,TV Show,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,0.757898,In a city of coaching centers known to train I...,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows
...,...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,X-Men: Evolution,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,-1.585979,X-Men: Evolution features the team as teenager...,Action-Adventure,Animation,Kids,Action-Adventure,Animation
11055,TV Show,Smart Guy,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,-2.032431,A genius tries to fit in as a high school soph...,Comedy,Coming of Age,Kids,Comedy,Coming of Age
11056,TV Show,Disney Kirby Buckets,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,-0.023394,Welcome to Kirby's world! It's rude and sketchy.,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy
11057,TV Show,Disney Mech-X4,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,0.199832,Ryan discovers his ability to control a giant ...,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy


## Extract Attributes from Text

### Concatenate Title and Description columns:

In [19]:
# Merge title and description into one space-separated text column, then
# remove the two source columns in place.
df["information"] = df["title"].add(" ").add(df["description"])
df.drop(columns=["title", "description"], inplace=True)

In [20]:
# Display the frame with the combined information column.
df

Unnamed: 0,type,cast,country,release_year,genre0,genre1,genre2,genre3,genre4,information
0,Movie,David Attenborough,United States,0.646285,Documentaries,Documentaries,Documentaries,Documentaries,Documentaries,Dick Johnson Is Dead As her father nears the e...
1,TV Show,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,0.757898,International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas,"Blood & Water After crossing paths at a party,..."
2,TV Show,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,0.757898,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows,Ganglands To protect his family from a powerfu...
3,TV Show,David Attenborough,United States,0.757898,Docuseries,Reality TV,Docuseries,Reality TV,Docuseries,"Jailbirds New Orleans Feuds, flirtations and t..."
4,TV Show,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,0.757898,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows,Kota Factory In a city of coaching centers kno...
...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,-1.585979,Action-Adventure,Animation,Kids,Action-Adventure,Animation,X-Men: Evolution X-Men: Evolution features the...
11055,TV Show,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,-2.032431,Comedy,Coming of Age,Kids,Comedy,Coming of Age,Smart Guy A genius tries to fit in as a high s...
11056,TV Show,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,-0.023394,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy,Disney Kirby Buckets Welcome to Kirby's world!...
11057,TV Show,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,0.199832,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy,Disney Mech-X4 Ryan discovers his ability to c...


In [21]:
# Independent deep copy of the current frame; it later receives the larger
# word-feature sets (top 60 / top 20) while `df` gets the smaller ones.
df_more_information = df.copy(deep=True)

In [22]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

In [23]:
# Learn the vocabulary of the information column and build its
# document-term count matrix (one row per dataset row).
matrix = cv.fit_transform(df["information"])

In [24]:
# Wrap the counts in a DataFrame with one column per vocabulary term.
# NOTE(review): toarray() materializes the whole matrix densely
# (rows x vocabulary size), which can be memory-heavy for large vocabularies.
count = pd.DataFrame(matrix.toarray(),columns=cv.get_feature_names_out())

In [25]:
# Rank vocabulary terms by total occurrences across all rows (descending).
# The ranking is built once: top60 is its first 60 entries and top30 is the
# first 30 of those.
top60 = count.sum().sort_values(ascending=False).head(60)
top30 = top60.head(30)

In [26]:
# Show the 30 most frequent words with their total counts.
top30

the      16240
and       9939
to        8978
of        8920
in        6427
his       3992
with      3334
her       2646
for       2560
on        2547
an        2485
their     2261
is        2223
from      1868
this      1865
as        1841
when      1731
that      1452
he        1420
by        1401
who       1350
life      1279
after     1147
new       1146
they      1105
up        1083
at        1078
but       1066
world     1061
into       973
dtype: int64

### Top 30 words in the information column, sorted by their frequency

In [27]:
# Keep only the words themselves (the index labels of the ranked counts).
top30_words, top60_words = top30.index, top60.index

In [28]:
# Add one per-row count column for each selected word, then remove the raw
# information text: `df` gets the top-30 words, `df_more_information` the
# top-60 words.
# NOTE(review): a word equal to an existing column name would overwrite that
# column, because the assignment is keyed on the word itself.
for frame, vocabulary in ((df, top30_words), (df_more_information, top60_words)):
    for word in vocabulary:
        frame[word] = count[word]
    frame.drop(columns="information", inplace=True)

In [29]:
# Display the frame with the word-count feature columns added.
df

Unnamed: 0,type,cast,country,release_year,genre0,genre1,genre2,genre3,genre4,the,...,who,life,after,new,they,up,at,but,world,into
0,Movie,David Attenborough,United States,0.646285,Documentaries,Documentaries,Documentaries,Documentaries,Documentaries,2,...,0,1,0,0,0,0,0,0,0,0
1,TV Show,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,0.757898,International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas,0,...,1,0,1,0,0,0,2,0,0,0
2,TV Show,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,0.757898,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows,0,...,0,0,0,0,0,0,0,0,0,1
3,TV Show,David Attenborough,United States,0.757898,Docuseries,Reality TV,Docuseries,Reality TV,Docuseries,2,...,0,0,0,2,0,0,1,0,0,0
4,TV Show,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,0.757898,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,"Noel Fisher, Vincent Gale, Christopher Judge, ...",United States,-1.585979,Action-Adventure,Animation,Kids,Action-Adventure,Animation,1,...,0,0,0,0,1,0,0,0,1,0
11055,TV Show,"Tahj Mowry, John Jones, Jason Weaver, Essence ...",United States,-2.032431,Comedy,Coming of Age,Kids,Comedy,Coming of Age,0,...,0,0,0,0,0,0,0,0,0,0
11056,TV Show,"Jacob Bertrand, Mekai Curtis, Cade Sutton, Oli...",United States,-0.023394,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy,0,...,0,0,0,0,0,0,0,0,1,0
11057,TV Show,"Nathaniel Potvin, Raymond Cham, Kamran Lucas, ...",Canada,0.199832,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy,0,...,0,0,0,0,0,0,0,0,0,0


### Words added to dataframe, successfully

### Do same things for 'cast' column:

In [30]:
# Refit the same vectorizer on the cast column; this replaces the vocabulary
# learned from the information text. The recorded output lists "david" and
# "attenborough" as separate tokens, i.e. names are split into single words.
matrix = cv.fit_transform(df["cast"])

In [31]:
# Rebuild `count` for the cast tokens (one column per token); this overwrites
# the information-word counts held in `count` until now.
count = pd.DataFrame(matrix.toarray(),columns=cv.get_feature_names_out())

In [32]:
# Rank cast tokens by total occurrences (descending). The ranking is built
# once: top20 is its first 20 entries and top10 is the first 10 of those.
top20 = count.sum().sort_values(ascending=False).head(20)
top10 = top20.head(10)

In [33]:
# Show the 10 most frequent cast tokens with their total counts.
top10

david           2010
attenborough    1387
michael          748
john             654
james            511
lee              502
paul             398
kim              368
de               328
tom              302
dtype: int64

### Top 10 words in the cast column, sorted by their frequency

In [34]:
# Keep only the tokens themselves (the index labels of the ranked counts).
top10_words, top20_words = top10.index, top20.index

In [35]:
# Add one per-row count column for each selected cast token, then remove the
# raw cast text: `df` gets the top-10 tokens, `df_more_information` the top-20.
# NOTE(review): a token equal to an existing column name (including a word
# column added from the information text) would overwrite that column.
for frame, vocabulary in ((df, top10_words), (df_more_information, top20_words)):
    for word in vocabulary:
        frame[word] = count[word]
    frame.drop(columns="cast", inplace=True)

In [36]:
# Display the frame with the cast-token feature columns added.
df

Unnamed: 0,type,country,release_year,genre0,genre1,genre2,genre3,genre4,the,and,...,david,attenborough,michael,john,james,lee,paul,kim,de,tom
0,Movie,United States,0.646285,Documentaries,Documentaries,Documentaries,Documentaries,Documentaries,2,1,...,1,1,0,0,0,0,0,0,0,0
1,TV Show,South Africa,0.757898,International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas,0,0,...,0,0,0,0,0,0,0,0,1,0
2,TV Show,United States,0.757898,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows,0,2,...,0,0,0,0,0,0,0,0,0,0
3,TV Show,United States,0.757898,Docuseries,Reality TV,Docuseries,Reality TV,Docuseries,2,1,...,1,1,0,0,0,0,0,0,0,0
4,TV Show,India,0.757898,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,United States,-1.585979,Action-Adventure,Animation,Kids,Action-Adventure,Animation,1,1,...,1,0,0,0,0,0,0,0,0,0
11055,TV Show,United States,-2.032431,Comedy,Coming of Age,Kids,Comedy,Coming of Age,0,0,...,0,0,0,1,0,0,0,0,0,0
11056,TV Show,United States,-0.023394,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy,0,1,...,0,0,0,0,0,0,0,0,0,0
11057,TV Show,Canada,0.199832,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy,0,0,...,0,0,0,0,0,0,0,0,0,0


### Words added to dataframe, successfully

## 3. Handle Categorical Data:

### One-hot Encoding VS Label Encoding:

Label Encoding is very simple and it involves converting each value in a column to a number.

Although label encoding is straightforward, it has the disadvantage that the numeric values can be misinterpreted by algorithms as having some sort of hierarchy/order in them. This ordering issue is addressed by a common alternative approach called ‘One-Hot Encoding’. In this strategy, each category value is converted into a new column, and the column is assigned a 1 or 0 (notation for true/false).

resource: https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd

### Because we use tree-based algorithms, both of these encodings give essentially the same result, so we use Label Encoding because it is simpler than One-hot Encoding.

In [37]:
# Encoder used below to map string categories to integer codes.
label_encoder = LabelEncoder()

In [38]:
# Display the frame before the categorical columns are label-encoded.
df

Unnamed: 0,type,country,release_year,genre0,genre1,genre2,genre3,genre4,the,and,...,david,attenborough,michael,john,james,lee,paul,kim,de,tom
0,Movie,United States,0.646285,Documentaries,Documentaries,Documentaries,Documentaries,Documentaries,2,1,...,1,1,0,0,0,0,0,0,0,0
1,TV Show,South Africa,0.757898,International TV Shows,TV Dramas,TV Mysteries,International TV Shows,TV Dramas,0,0,...,0,0,0,0,0,0,0,0,1,0
2,TV Show,United States,0.757898,Crime TV Shows,International TV Shows,TV Action & Adventure,Crime TV Shows,International TV Shows,0,2,...,0,0,0,0,0,0,0,0,0,0
3,TV Show,United States,0.757898,Docuseries,Reality TV,Docuseries,Reality TV,Docuseries,2,1,...,1,1,0,0,0,0,0,0,0,0
4,TV Show,India,0.757898,International TV Shows,Romantic TV Shows,TV Comedies,International TV Shows,Romantic TV Shows,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11054,TV Show,United States,-1.585979,Action-Adventure,Animation,Kids,Action-Adventure,Animation,1,1,...,1,0,0,0,0,0,0,0,0,0
11055,TV Show,United States,-2.032431,Comedy,Coming of Age,Kids,Comedy,Coming of Age,0,0,...,0,0,0,1,0,0,0,0,0,0
11056,TV Show,United States,-0.023394,Action-Adventure,Comedy,Coming of Age,Action-Adventure,Comedy,0,1,...,0,0,0,0,0,0,0,0,0,0
11057,TV Show,Canada,0.199832,Action-Adventure,Comedy,Science Fiction,Action-Adventure,Comedy,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Label-encode every object (string) column of `df` in place.
#
# A fresh encoder is fitted for each column and kept in `label_encoders`
# (column name -> fitted LabelEncoder). Refitting one shared encoder would
# discard the mapping of every column except the last, so the codes could not
# be decoded with `inverse_transform` or reapplied to other data afterwards.
# The encoded values are unchanged, and `label_encoder` still ends up as the
# encoder fitted on the last object column, as before.
#
# NOTE(review): each column gets its own code space, so the same genre string
# receives different integers in genre0..genre4 (visible in the recorded
# output, where one genre repeated across a row maps to 22, 19, 20, 21, 21).
label_encoders = {}
for title in df:
    if df[title].dtype == "object":
        label_encoder = LabelEncoder()
        df[title] = label_encoder.fit_transform(df[title])
        label_encoders[title] = label_encoder

In [40]:
# Display the fully numeric frame after label encoding.
df

Unnamed: 0,type,country,release_year,genre0,genre1,genre2,genre3,genre4,the,and,...,david,attenborough,michael,john,james,lee,paul,kim,de,tom
0,0,608,0.646285,22,19,20,21,21,2,1,...,1,1,0,0,0,0,0,0,0,0
1,1,430,0.757898,37,69,79,38,69,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,608,0.757898,20,37,75,19,39,0,2,...,0,0,0,0,0,0,0,0,0,0
3,1,608,0.757898,24,51,22,51,23,2,1,...,1,1,0,0,0,0,0,0,0,0
4,1,254,0.757898,37,54,76,38,55,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11054,1,608,-1.585979,2,4,39,2,5,1,1,...,1,0,0,0,0,0,0,0,0,0
11055,1,608,-2.032431,18,16,39,17,18,0,0,...,0,0,0,1,0,0,0,0,0,0
11056,1,608,-0.023394,2,15,17,2,17,0,1,...,0,0,0,0,0,0,0,0,0,0
11057,1,59,0.199832,2,15,62,2,17,0,0,...,0,0,0,0,0,0,0,0,0,0


### Label-encode the non-numerical columns with sklearn's LabelEncoder

# Phase 2 : Predict the Goal and Model Optimization

# Phase 3 : Prediction with Group Learning