## Basic Stats about data

In [1]:
#importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#reading user data
user_data = pd.read_csv('user.csv')

In [3]:
#printing first 10 rows
user_data.head(10)

Unnamed: 0,user_id,user_age,gender,location,joining_date
0,user_45028@domain.com,40,M,Goa,2018-06-18
1,user_95235@domain.com,55,F,Andhra Pradesh,2019-09-17
2,user_44289@domain.com,38,F,Goa,2018-09-03
3,user_23708@domain.com,47,F,West Bengal,2020-11-15
4,user_40584@domain.com,24,M,Tamil Nadu,2018-02-06
5,user_18216@domain.com,52,F,Gujarat,2020-02-12
6,user_97032@domain.com,46,F,Karnataka,2020-07-17
7,user_64366@domain.com,53,F,Uttar Pradesh,2019-10-08
8,user_71452@domain.com,18,M,Maharashtra,2020-09-26
9,user_76063@domain.com,20,M,Telangana,2020-07-12


In [4]:
# report the dimensions of the user table
n_user_rows, n_user_cols = user_data.shape
print(f"number of rows in user_data is {n_user_rows}")
print(f"number of columns in user_data is {n_user_cols}")

number of rows in user_data is 13843
number of columns in user_data is 5


In [5]:
#reading content data
content_data = pd.read_csv('content.csv')
content_data.head(10)

Unnamed: 0,content_id,content_type,language,genre,duration,release_date,rating,episode_count,season_count
0,cont_475_19_32,series,english,drama,4980000,2018-07-01,10,32,19
1,cont_2185_15_21,series,english,drama,3000000,2016-03-29,4,21,15
2,cont_4857_13_28,series,tamil,comedy,3120000,2006-03-06,8,28,13
3,cont_3340_1_5,sports,hindi,cricket,9900000,2009-01-10,0,5,1
4,cont_1664_10_29,series,hindi,action,3660000,2020-05-25,2,29,10
5,cont_51_1_37,series,hindi,comedy,3060000,2002-02-04,10,37,1
6,cont_2208_1_24,series,marathi,drama,3600000,2011-07-09,7,24,1
7,cont_2679_5_17,series,hindi,comedy,3240000,1994-05-03,7,17,5
8,cont_4790_4_19,series,marathi,drama,3060000,2020-12-18,1,19,4
9,cont_1437_5_25,series,hindi,comedy,2880000,2002-05-27,1,25,5


In [6]:
# relationship.csv also carries a column named 'duration', so the content
# table's column gets its own name (content_duration) before any merge
content_data = content_data.rename(columns={'duration': 'content_duration'})

In [7]:
# report the dimensions of the content table
n_content_rows, n_content_cols = content_data.shape
print(f"number of rows in content_data is {n_content_rows}")
print(f"number of columns in content_data is {n_content_cols}")

number of rows in content_data is 48645
number of columns in content_data is 9


## Make dataset ready using relationship.csv

In [8]:
#reading relationship.csv
relation_data = pd.read_csv('relationship.csv')
relation_data.head(10)

Unnamed: 0,user_id,content_id,duration,date,start_time,end_time
0,user_44902@domain.com,cont_1718_16_7,1920000,2020-06-17,06:19:13,06:51:13
1,user_87018@domain.com,cont_825_1_3,1080000,2019-12-06,19:58:14,20:16:14
2,user_53430@domain.com,cont_1981_2_26,180000,2020-07-27,01:52:51,01:55:51
3,user_53696@domain.com,cont_1072_18_7,1440000,2021-08-24,06:34:14,06:58:14
4,user_80251@domain.com,cont_760_9_27,960000,2020-06-15,16:57:19,17:13:19
5,user_52275@domain.com,cont_480_1_4,3000000,2021-08-21,18:07:58,18:57:58
6,user_99776@domain.com,cont_2719_6_11,1980000,2021-02-26,19:53:59,20:26:59
7,user_10182@domain.com,cont_2194_3_26,2760000,2021-03-23,15:06:17,15:52:17
8,user_90503@domain.com,cont_3023_2_23,840000,2020-08-31,05:11:41,05:25:41
9,user_13848@domain.com,cont_3815_1_5,4740000,2019-10-22,00:21:15,01:40:15


In [9]:
relation_data.shape

(1654450, 6)

In [10]:
#inner join on user_data and relation_data to get all columns of user_data
# how='inner' keeps only the rows whose user_id appears in both tables;
# every kept relationship row gains that user's columns
merged_df = pd.merge(user_data,relation_data,how = 'inner',on = 'user_id')

In [11]:
merged_df.columns

Index(['user_id', 'user_age', 'gender', 'location', 'joining_date',
       'content_id', 'duration', 'date', 'start_time', 'end_time'],
      dtype='object')

In [12]:
#inner join on merged_data and content_data
# joined on content_id; rows whose content_id is missing from content_data are dropped
merged_final = pd.merge(merged_df,content_data,how = 'inner',on = "content_id")
# last expression of the cell: shows (rows, columns) of the joined table
merged_final.shape

(1654450, 18)

In [13]:
#final dataset columns
merged_final.columns

Index(['user_id', 'user_age', 'gender', 'location', 'joining_date',
       'content_id', 'duration', 'date', 'start_time', 'end_time',
       'content_type', 'language', 'genre', 'content_duration', 'release_date',
       'rating', 'episode_count', 'season_count'],
      dtype='object')

# Exploratory Data Analysis

In [14]:
merged_final.head(5)

Unnamed: 0,user_id,user_age,gender,location,joining_date,content_id,duration,date,start_time,end_time,content_type,language,genre,content_duration,release_date,rating,episode_count,season_count
0,user_44289@domain.com,38,F,Goa,2018-09-03,cont_3375_16_10,2220000,2020-06-03,18:47:17,19:24:17,series,english,action,3060000,2015-11-16,5,10,16
1,user_29171@domain.com,46,M,Odisa,2019-06-18,cont_3375_16_10,1980000,2021-06-24,00:30:37,01:03:37,series,english,action,3060000,2015-11-16,5,10,16
2,user_91037@domain.com,16,F,Maharashtra,2017-10-26,cont_3375_16_10,2580000,2018-04-02,22:56:40,23:39:40,series,english,action,3060000,2015-11-16,5,10,16
3,user_78324@domain.com,53,F,Andhra Pradesh,2018-03-16,cont_3375_16_10,420000,2020-12-02,12:23:56,12:30:56,series,english,action,3060000,2015-11-16,5,10,16
4,user_69522@domain.com,21,M,Karnataka,2019-06-30,cont_3375_16_10,1380000,2021-03-29,21:19:41,21:42:41,series,english,action,3060000,2015-11-16,5,10,16


In [15]:
merged_final.describe()

Unnamed: 0,user_age,duration,content_duration,rating,episode_count,season_count
count,1654450.0,1654450.0,1654450.0,1654450.0,1654450.0,1654450.0
mean,38.12572,1812842.0,3567260.0,4.970632,15.86375,6.620199
std,12.90478,1165702.0,947931.8,3.132943,12.5111,6.197996
min,16.0,60000.0,60000.0,0.0,0.0,0.0
25%,27.0,900000.0,3000000.0,2.0,6.0,2.0
50%,38.0,1740000.0,3360000.0,5.0,13.0,5.0
75%,49.0,2580000.0,3840000.0,8.0,23.0,9.0
max,60.0,10860000.0,11100000.0,10.0,60.0,44.0


## observations 

### 1. From the table above, user age ranges from 16 to 60, and the mean user age is about 38.
### 2. On average a user watches for approximately 30 minutes per viewing; the minimum watch time is about 1 minute and the maximum is 181 minutes (durations are stored in milliseconds).
### 3. On average content_duration is approximately 60 minutes; the minimum content duration is 1 minute and the maximum is 185 minutes.
### 4. On average the rating is about 5; the minimum rating is 0 and the maximum is 10.
### 5. On average there are about 16 episodes, and the maximum is 60.
### 6. On average there are about 6.6 seasons, and the maximum is 44.
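### 7. The means and medians (50th percentiles) are close for most columns, which suggests the distributions are not strongly skewed. This alone does not rule out outliers (for example, the maximum watch duration is 181 minutes while the 75th percentile is 43 minutes), so we check further with box plots.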

In [16]:
#info about data
merged_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1654450 entries, 0 to 1654449
Data columns (total 18 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   user_id           1654450 non-null  object
 1   user_age          1654450 non-null  int64 
 2   gender            1654450 non-null  object
 3   location          1654450 non-null  object
 4   joining_date      1654450 non-null  object
 5   content_id        1654450 non-null  object
 6   duration          1654450 non-null  int64 
 7   date              1654450 non-null  object
 8   start_time        1654450 non-null  object
 9   end_time          1654450 non-null  object
 10  content_type      1654450 non-null  object
 11  language          1654450 non-null  object
 12  genre             1654450 non-null  object
 13  content_duration  1654450 non-null  int64 
 14  release_date      1654450 non-null  object
 15  rating            1654450 non-null  int64 
 16  episode_count     

There are no null values in any of the columns.

In [17]:
#describe() above shows a minimum of 0 for both episode_count and season_count
#a season count or an episode count of zero is not valid
#this cell only counts the rows in which season_count is zero (they are removed in the next cell);
#NOTE(review): episode_count == 0 is not checked here - confirm whether it should be
merged_final[merged_final['season_count'] == 0 ].shape

(6945, 18)

There are 6945 rows in which season_count is zero.

In [18]:
# collect the index labels of the zero-season rows, then drop them in place;
# the index is not reset, so the remaining labels keep gaps where rows were removed
zero_season_labels = merged_final.index[merged_final['season_count'] == 0]
merged_final.drop(index=zero_season_labels, inplace=True)


In [19]:
#after removing the rows shape will be
merged_final.shape

(1647505, 18)

In [20]:
#checking if any duplicate rows present or not
merged_final.duplicated().sum()

0

In [21]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [22]:
def vectorize_categorical(data):
    """One-hot encode a single categorical column.

    Parameters
    ----------
    data : pandas.Series
        Column to encode. Its values are reshaped to a single feature
        column of shape (n_rows, 1) before fitting.

    Returns
    -------
    The output of ``OneHotEncoder().fit_transform``: with the encoder's
    default settings this is a sparse matrix with one row per input value
    and one column per distinct value.
    """
    as_single_feature = data.values.reshape(-1, 1)
    return OneHotEncoder().fit_transform(as_single_feature)

def vectorize_numerical(data):
    """Standardise a single numeric column with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.Series
        Column to scale. Its values are reshaped to a single feature
        column of shape (n_rows, 1) before fitting.

    Returns
    -------
    The output of ``StandardScaler().fit_transform``: an array of shape
    (n_rows, 1) holding the column scaled with the scaler's default settings.
    """
    as_single_feature = data.values.reshape(-1, 1)
    return StandardScaler().fit_transform(as_single_feature)

Only the columns that are useful for recommendation are vectorized below.

In [23]:
#vectorizing the faetures
#user_age
merged_final['user_age'].describe()

count    1.647505e+06
mean     3.812545e+01
std      1.290479e+01
min      1.600000e+01
25%      2.700000e+01
50%      3.800000e+01
75%      4.900000e+01
max      6.000000e+01
Name: user_age, dtype: float64

In [24]:
#user_age
vect_age = vectorize_numerical(merged_final['user_age'])

In [25]:
#location
merged_final['location'].describe()

count       1647505
unique           29
top       Karnataka
freq         226580
Name: location, dtype: object

In [26]:
vect_location = vectorize_categorical(merged_final['location'])

In [27]:
#duration
merged_final['duration'].describe()

count    1.647505e+06
mean     1.810053e+06
std      1.157088e+06
min      6.000000e+04
25%      9.000000e+05
50%      1.740000e+06
75%      2.580000e+06
max      1.086000e+07
Name: duration, dtype: float64

In [28]:
vect_duration = vectorize_numerical(merged_final['duration'])

In [29]:
#content_type
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['content_type'].describe()
# one-hot encode the content type
vect_content_type = vectorize_categorical(merged_final['content_type'])

In [30]:
#language
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['language'].describe()
# one-hot encode the language
vect_language = vectorize_categorical(merged_final['language'])

In [31]:
vect_language.shape

(1647505, 11)

In [32]:
#genre
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['genre'].describe()
# one-hot encode the genre
vect_genre = vectorize_categorical(merged_final['genre'])

In [39]:
#rating
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['rating'].describe()
# rating is an integer column but is one-hot encoded here, i.e. treated as a
# category with one output column per distinct rating value
vect_rating = vectorize_categorical(merged_final['rating'])

In [34]:
#content_duration
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['content_duration'].describe()
# standardise the content duration
vect_content_duration = vectorize_numerical(merged_final['content_duration'])

In [35]:
#episode_count
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['episode_count'].describe()
# standardise the episode count
vect_episode_count = vectorize_numerical(merged_final['episode_count'])

In [36]:
#season_count
# NOTE(review): describe() is not the last expression of the cell, so its summary is not displayed
merged_final['season_count'].describe()
# standardise the season count
vect_season_count = vectorize_numerical(merged_final['season_count'])

In [37]:
vect_season_count.shape

(1647505, 1)

In [40]:
# stack every feature block side by side into one matrix;
# the one-hot (sparse) blocks are densified first so np.hstack can combine them
feature_blocks = (
    vect_age,
    vect_duration,
    vect_rating.todense(),
    vect_content_duration,
    vect_episode_count,
    vect_season_count,
    vect_location.todense(),
    vect_content_type.todense(),
    vect_language.todense(),
    vect_genre.todense(),
)
x = np.hstack(feature_blocks)

In [41]:
x.shape

(1647505, 70)

In [None]:
#computing cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# The stacked feature matrix is called `x`; `data_x` was never defined.
# A full pairwise matrix holds n_rows * n_rows float64 values. For the ~1.65M
# rows here that is on the order of 20 TB, so it is only attempted when the
# result fits inside a fixed memory budget; otherwise the cell skips it instead
# of crashing the kernel.
MAX_SIM_MATRIX_BYTES = 2 * 1024 ** 3  # 2 GiB; raise this if more RAM is available
n_rows = x.shape[0]
sim_matrix_bytes = n_rows * n_rows * 8
if sim_matrix_bytes <= MAX_SIM_MATRIX_BYTES:
    # x is an np.matrix (np.hstack of .todense() blocks); recent scikit-learn
    # versions reject np.matrix input, so hand over a plain ndarray view
    sim_matrix = cosine_similarity(np.asarray(x))
else:
    sim_matrix = None
    print(f"skipping full similarity matrix: it would need about {sim_matrix_bytes / 1e12:.1f} TB")

Computing the full cosine similarity matrix for the feature matrix above is very expensive: with about 1.65 million rows it would have about 1.65M × 1.65M entries, which takes far too much time and memory.


So one workaround is to compute the similarities for a particular user at run time, whenever we need them,

and store the user similarities in a dictionary of dictionaries,

i.e {
            'user_id':
                            
                             {
                             
                             
                             'user_id' : cosine_sim_value
                             
                             }
                             
       }

In [47]:
merged_final[merged_final['user_id'] == 'user_18085@domain.com'].index

Int64Index([  45227,   58430,   60310,   61681,  164551,  171425,  174977,
             252368,  255612,  256618,
            ...
            1575643, 1575662, 1575671, 1575682, 1575695, 1575714, 1575730,
            1575745, 1575759, 1575777],
           dtype='int64', length=269)

In [52]:
merged_final.loc[58430,:]

user_id             user_18085@domain.com
user_age                               40
gender                                  M
location                        Telangana
joining_date                   2018-12-08
content_id                 cont_3036_5_26
duration                          2220000
date                           2020-04-11
start_time                       21:56:19
end_time                         22:33:19
content_type                       series
language                           telugu
genre                               drama
content_duration                  3780000
release_date                   2004-10-02
rating                                  8
episode_count                          26
season_count                            5
Name: 58430, dtype: object

In [None]:
#recommendation
def recommend(user_id, top_n=10, data=None, features=None):
    """Return up to ``top_n`` content ids recommended for ``user_id``.

    No full row-by-row similarity matrix is needed. The feature rows that
    belong to the user are averaged into one profile vector, and the cosine
    similarity between that profile and every row of the feature matrix is
    computed at call time. Rows are ranked by similarity and their content
    ids are returned, most similar first.

    Parameters
    ----------
    user_id : str
        Value to look up in the ``user_id`` column.
    top_n : int, default 10
        Maximum number of content ids to return.
    data : pandas.DataFrame, optional
        Frame with ``user_id`` and ``content_id`` columns. Defaults to the
        notebook's ``merged_final``.
    features : array-like of shape (n_rows, n_features), optional
        Feature matrix whose i-th row describes the i-th row (by position)
        of ``data``. Defaults to the notebook's ``x``.

    Returns
    -------
    list
        Distinct content ids ordered from most to least similar. The user's
        own rows are not candidates. Empty when the user has no rows in
        ``data`` or when ``top_n`` is not positive.

    Raises
    ------
    ValueError
        If ``data`` and ``features`` do not have the same number of rows.
    """
    if data is None:
        data = merged_final
    if features is None:
        features = x

    # np.asarray turns an np.matrix (what np.hstack returns for .todense()
    # inputs) into a plain 2-D array, so selecting rows and averaging them
    # gives an ordinary 1-D vector
    feats = np.asarray(features, dtype=float)
    if feats.shape[0] != len(data):
        raise ValueError(
            "data has {} rows but features has {}".format(len(data), feats.shape[0])
        )

    if top_n <= 0:
        return []

    # Work with row positions, not index labels: the frame's labels need not
    # be 0..n-1 (merged_final keeps gaps after rows were dropped), while the
    # feature matrix is purely positional
    is_user = (data['user_id'] == user_id).to_numpy()
    if not is_user.any():
        return []

    profile = feats[is_user].mean(axis=0)

    # cosine similarity of every row against the profile; rows (or a profile)
    # with zero norm get similarity 0 instead of a division by zero
    dots = feats @ profile
    norms = np.linalg.norm(feats, axis=1) * np.linalg.norm(profile)
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # the user's own rows are the query, not candidates
    candidates = np.flatnonzero(~is_user)
    # stable sort keeps the original row order among equal similarities
    ranked = candidates[np.argsort(-sims[candidates], kind='stable')]

    # pd.unique keeps first-appearance order, so the best-ranked occurrence
    # of each content id decides its place in the result
    content_ids = data['content_id'].to_numpy()
    return pd.unique(content_ids[ranked])[:top_n].tolist()

In [None]:
#creating submission.json file
import json

test = pd.read_csv('test.csv')
li_test = test['user_id']

# one recommend() call per distinct user: repeated ids would only overwrite
# the same dictionary key with the same result
submission = {user: recommend(user) for user in li_test.unique()}

with open('submission.json', 'w') as file:
    json.dump(submission, file)

# Build one row per user. Passing the dict itself together with
# columns=['user_id', 'contents'] makes pandas look those two names up as
# dictionary keys, which produces an empty frame and an empty CSV.
data_frame = pd.DataFrame(list(submission.items()), columns=['user_id', 'contents'])
data_frame.to_csv('submission.csv')
