In [1]:
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
datasets = ['ITV_total_muestra_live.csv', 'ITV_total_muestra_vod.csv', 
            'OPL_total_muestra_live.csv', 'OPL_total_muestra_vod.csv',
            'TVE_DTH_total_muestra_live.csv', 'TVE_DTH_total_muestra_vod.csv']
dfs = pd.DataFrame()

for dataset in range(len(datasets)): 

    df = pd.read_csv('Data/Extraction 3/' + datasets[dataset], sep="\t")
    dfs = pd.concat([dfs, df], ignore_index=True)

cols_names = dfs.columns.str.split('\t')
cols_names = [col for cols_names in cols_names for col in cols_names]

dfs.head()

Unnamed: 0,user_id,customer_id,subscription_id,unique_user_id,device_id,channel_call_letter,channel_name,channel_type,channel_subtype,program_id,...,profile_id,global_op_id,service_type,capture_day,duration,type,service,offset,buffering,commercialization_type
0,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,,,83b2d24add3b48f96edfabfcab903501c5baaf14ab994b...,95faf0c14944c5c29d764cbdb5b74e41b49270b1686e1f...,CHV,CHV,,,267517334,...,0.0,CL,1003,2021-12-19,964,live,ITV,,,
1,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,,,83b2d24add3b48f96edfabfcab903501c5baaf14ab994b...,95faf0c14944c5c29d764cbdb5b74e41b49270b1686e1f...,MEGA,MEGA,,,267606959,...,0.0,CL,1003,2021-12-19,1437,live,ITV,,,
2,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,,,83b2d24add3b48f96edfabfcab903501c5baaf14ab994b...,95faf0c14944c5c29d764cbdb5b74e41b49270b1686e1f...,MEGA,MEGA,,,264213105,...,0.0,CL,1003,2021-12-19,3338,live,ITV,,,
3,68db52682dcf2f0dff42f36ce26abe2003d87ca76142b4...,,,a85d344894d64c861f7ec7589f6c93640c0e8d42f62ed9...,54e5d381955144c11f5960bdee3d5655ae3f56a0126b5a...,TVN,TVN,,,267607757,...,0.0,CL,1003,2021-12-19,120,live,ITV,,,
4,aabc1623e0307bbaf08c0e85fb507ba08f5f18fe8ad5fe...,,,b43d8a2daa729b5b1384679b73a81334297c90236556a3...,25db36c580cd8105ab3f24fecf8d71d694c006e1aab745...,CHV,CHV,,,267517392,...,0.0,CL,1003,2021-12-19,960,live,ITV,,,


In [3]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203065 entries, 0 to 203064
Data columns (total 41 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 203065 non-null  object 
 1   customer_id             0 non-null       float64
 2   subscription_id         0 non-null       float64
 3   unique_user_id          203065 non-null  object 
 4   device_id               203065 non-null  object 
 5   channel_call_letter     200546 non-null  object 
 6   channel_name            200546 non-null  object 
 7   channel_type            0 non-null       float64
 8   channel_subtype         0 non-null       float64
 9   program_id              203065 non-null  int64  
 10  program_name            203065 non-null  object 
 11  normal_program_name     203065 non-null  object 
 12  program_theme           203065 non-null  object 
 13  program_subtheme        202711 non-null  object 
 14  date_time_start     

In [4]:
dfs['normal_program_name']

0                                La divina comida
1                              Especial de prensa
2                          Meganoticias actualiza
3                                     Chile Elige
4                El discípulo del chef - Lo mejor
                           ...                   
203060                         Un hombre previsor
203061                                    Gênesis
203062                       El Escuadrón Suicida
203063                       El Escuadrón Suicida
203064    Robin Hood: El príncipe de los ladrones
Name: normal_program_name, Length: 203065, dtype: object

In [5]:
# Drop NA columns
dfs = dfs.drop(['customer_id', 'subscription_id', 'channel_type', 'channel_subtype'], axis=1)

# Change to datetime and integer type
dfs = dfs.astype({"date_time_start":"datetime64", "end_date_time":"datetime64",
                  "program_start":"datetime64", "program_end":"datetime64", "duration":"int"})

# Delete microseconds
dfs["date_time_start"] = dfs["date_time_start"].apply(lambda x: x.replace(microsecond=0))
dfs["end_date_time"] = dfs["end_date_time"].apply(lambda x: x.replace(microsecond=0))
dfs["program_start"] = dfs["program_start"].apply(lambda x: x.replace(microsecond=0))
dfs["program_end"] = dfs["program_end"].apply(lambda x: x.replace(microsecond=0))

# Convert from UTC Timezone to 
dfs["date_time_start"] = dfs["date_time_start"].dt.tz_localize('UTC').dt.tz_convert('Chile/Continental')
dfs["end_date_time"] = dfs["end_date_time"].dt.tz_localize('UTC').dt.tz_convert('Chile/Continental')
dfs["program_start"] = dfs["program_start"].dt.tz_localize('UTC').dt.tz_convert('Chile/Continental')
dfs["program_end"] = dfs["program_end"].dt.tz_localize('UTC').dt.tz_convert('Chile/Continental')

# Eliminate Timezone UTC-3 info from column
dfs["date_time_start"] = dfs["date_time_start"].dt.tz_localize(None)
dfs["end_date_time"] = dfs["end_date_time"].dt.tz_localize(None)
dfs["program_start"] = dfs["program_start"].dt.tz_localize(None)
dfs["program_end"] = dfs["program_end"].dt.tz_localize(None)

# Remove specific titles of program_name (ROBIN HOOD: EL PRÍNCIPE DE LOS LADRONES -> ROBIN HOOD)
dfs['program_name'] = dfs['program_name'].str.split(':').str[0]

dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203065 entries, 0 to 203064
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   user_id                 203065 non-null  object        
 1   unique_user_id          203065 non-null  object        
 2   device_id               203065 non-null  object        
 3   channel_call_letter     200546 non-null  object        
 4   channel_name            200546 non-null  object        
 5   program_id              203065 non-null  int64         
 6   program_name            203065 non-null  object        
 7   normal_program_name     203065 non-null  object        
 8   program_theme           203065 non-null  object        
 9   program_subtheme        202711 non-null  object        
 10  date_time_start         203065 non-null  datetime64[ns]
 11  end_date_time           203065 non-null  datetime64[ns]
 12  program_start           200546

In [6]:
# for i in range(len(dfs.columns)):
#     if len(dfs.iloc[:,i].value_counts()) <2:
#         print(dfs.iloc[:,i].value_counts())
# #         display(dfs.iloc[:,i])
# #     print(dfs.iloc[:,i].name, len(dfs.iloc[:,i].value_counts()))
    
# # # # session_type, profile_id, global_op_id, global_op_name -> 1

In [7]:
# Drop useless columns
df_med = dfs.drop(['unique_user_id', 'device_id', 'global_op_name', 'day', 'audio_language', 
                   'subs_language', 'session_type', 'profile_id', 'global_op_id', 'capture_day',
                   'offset', 'buffering', 'commercialization_type', 'producer', 'distributor'], axis=1)

# Drop "repeated columns"
df_med = df_med.drop(['real_session_duration', 'real_start_time', 
                      'channel_call_letter', 'normal_program_name', 
                      'program_id'], axis=1)

# Drop not useful columns
df_med = df_med.drop(['service_source', 'service_subtype', 'service_type'], axis=1)

    - user_id, **unique_user_id**, **device_id** -> Personal id, not really useful for model and identification in viz not necessary
    - program_id -> Might not be useful if we already have the program name (although they not same in values)
    - program_start, program_end
    - service_name -> May be possible to extract it from device_type_used (Maybe hierarchical?)
    - service_source, service_subtype, service_type -> do not really know what it is (have to look into it but does not look good since are not really populated)

- **global_op_name**, **session_type**, **profile_id**, **global_op_id** -> Only one value (only useful if different in other countries)
- user_id, **unique_user_id**, **device_id** -> Personal id, not really useful for model and identification in viz not necessary
- channel_name vs **channel_call_letter** -> Both should be same, but actually not same. (only one stays? Better may be channel_name)
- program_id -> Might not be useful if we already have the program name (although they not same in values)
- program_name vs **normal_program_name** -> Both same, only one stays (Better may be program_name)
- **real_start_time** vs date_time_start -> Does not make sense to keep real_start_time if almost same (Have to check for similarity)
- program_start, program_end
- **real_session_duration** vs duration -> Both same, only one stays (Better may be duration)
- service_name -> May be possible to extract it from device_type_used (Maybe hierarchical?)
- **day**, **capture_day** -> already have variables with day and time
- **audio_language**, **subs_language** -> not really populated (could be useful if we actually had it)
- service_source, service_subtype, service_type, **offset**, **buffering**, **commerzialization_type**, **producer**, **distributor** -> do not really know what it is (have to look into it but does not look good since are not really populated)

In [8]:
# df_med['duration'].sort_values(ascending=False).head(10)

# df_mask=df_med['duration']>=12600
# filtered_df = df_med[df_mask]
# display(filtered_df)
# filtered_df.iloc[:,12].value_counts()

In [9]:
# Dropping samples where the duration is less than 1 minute or larger than 3 hours
df_tot = df_med.drop(df_med[df_med['duration'] > 12600].index)
df_tot = df_tot.drop(df_tot[df_tot['duration'] < 60].index)

In [10]:
# Make classification of device_type_used column values into more general classification
df_tot.loc[df_tot['device_type_used'].isin(['tvLg_no-Accedo', 'tvSamsung_2017+', 
                                            'tvAndroidTv', 'tvSamsung', 'tvLg', 
                                            'tvPanasonic', 'tvPhilips', 'tvSony']), 'device_type_used'] = 'TV'
df_tot.loc[df_tot['device_type_used'].isin(['stbHybridZapperCable', 'stbProteusCableUHD', 
                                            'stbProteusCableHD', 'stbHybridZapperCable1Gb', 
                                            'stbHybridPVRCable', 'stbHybridZapperSat']), 'device_type_used'] = 'STB CATV'
df_tot.loc[df_tot['device_type_used'].isin(['sphAndroid', 'sphApple']), 'device_type_used'] = 'SMARTPHONE'
df_tot.loc[df_tot['device_type_used'].isin(['tabApple', 'tabAndroid', 'tabWin']), 'device_type_used'] = 'TABLET'
df_tot.loc[df_tot['device_type_used'].isin(['Chromecast']), 'device_type_used'] = 'STREAMER'
df_tot.loc[df_tot['device_type_used'].isin(['stbAndroidTv', 'FireTV']), 'device_type_used'] = 'STB OTT'
df_tot.loc[df_tot['device_type_used'].isin(['OpenIPTV_STB', 'Mediaroom']), 'device_type_used'] = 'STB IPTV'

# Change names
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_SPECIAL']), 'program_theme'] = 'Special'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_NEWS']), 'program_theme'] = 'News'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_MOVIE']), 'program_theme'] = 'Movie'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_SPORTS']), 'program_theme'] = 'Sports'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_SERIES']), 'program_theme'] = 'Series'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_KIDS']), 'program_theme'] = 'Kids'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_PaidProgramming']), 'program_theme'] = 'Paid'
df_tot.loc[df_tot['program_theme'].isin(['MSEPG_SHORTFILM']), 'program_theme'] = 'Shortfilm'
df_tot.loc[df_tot['program_theme'].isin(['Movie']), 'program_theme'] = 'Movie'
df_tot.loc[df_tot['program_theme'].isin(['Episode']), 'program_theme'] = 'Series'

# Fill NA values
df_tot.fillna(value={'channel_name':'VOD'}, inplace=True)
df_tot['program_start'].fillna(df_tot['date_time_start'], inplace=True)
df_tot['program_end'].fillna(df_tot['end_date_time'], inplace=True)
df_tot['program_subtheme'].fillna(df_tot['program_theme'], inplace=True)

# Reset dataframe indexes
df_tot.reset_index(drop=True, inplace=True)

In [11]:
df_tot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202820 entries, 0 to 202819
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           202820 non-null  object        
 1   channel_name      202820 non-null  object        
 2   program_name      202820 non-null  object        
 3   program_theme     202820 non-null  object        
 4   program_subtheme  202820 non-null  object        
 5   date_time_start   202820 non-null  datetime64[ns]
 6   end_date_time     202820 non-null  datetime64[ns]
 7   program_start     202820 non-null  datetime64[ns]
 8   program_end       202820 non-null  datetime64[ns]
 9   device_type_used  202820 non-null  object        
 10  service_name      202820 non-null  object        
 11  duration          202820 non-null  int32         
 12  type              202820 non-null  object        
 13  service           202820 non-null  object        
dtypes: d

In [12]:
df_tot.head(5)

Unnamed: 0,user_id,channel_name,program_name,program_theme,program_subtheme,date_time_start,end_date_time,program_start,program_end,device_type_used,service_name,duration,type,service
0,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,CHV,LA DIVINA COMIDA,Special,"Reality,Culinary",2021-12-18 22:31:22,2021-12-18 22:47:26,2021-12-18 22:30:00,2021-12-19 01:15:00,TV,Go,964,live,ITV
1,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,MEGA,ESPECIAL DE PRENSA,Special,Interview,2021-12-19 19:57:34,2021-12-19 20:21:31,2021-12-19 15:30:00,2021-12-19 20:30:00,TV,Go,1437,live,ITV
2,f1b9ff065d526bb021216d90a92455a2f036f26c74ac95...,MEGA,MEGANOTICIAS ACTUALIZA,News,"Interview,Politics,News",2021-12-19 13:01:22,2021-12-19 13:57:00,2021-12-19 13:00:00,2021-12-19 15:30:00,TV,Go,3338,live,ITV
3,68db52682dcf2f0dff42f36ce26abe2003d87ca76142b4...,TVN,CHILE ELIGE,News,"Interview,Politics",2021-12-19 20:18:51,2021-12-19 20:20:51,2021-12-19 15:00:00,2021-12-19 21:00:00,TV,Go,120,live,ITV
4,aabc1623e0307bbaf08c0e85fb507ba08f5f18fe8ad5fe...,CHV,EL DISCÍPULO DEL CHEF - LO MEJOR,Special,"Reality,Culinary",2021-12-19 01:48:00,2021-12-19 02:04:00,2021-12-19 01:15:00,2021-12-19 02:20:00,TV,Go,960,live,ITV


## Create Subthemes dataset

In [13]:
# Create another DataFrame to have only 1 subtheme per row
df_subthemes = df_tot[['user_id', 'channel_name', 'program_name','program_theme', 'program_subtheme','date_time_start', 'duration']]

df_subthemes['program_subtheme'] = df_subthemes.program_subtheme.str.split(',')
df_subthemes = df_subthemes.explode('program_subtheme')
df_subthemes.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subthemes['program_subtheme'] = df_subthemes.program_subtheme.str.split(',')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 423777 entries, 0 to 202819
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           423777 non-null  object        
 1   channel_name      423777 non-null  object        
 2   program_name      423777 non-null  object        
 3   program_theme     423777 non-null  object        
 4   program_subtheme  423777 non-null  object        
 5   date_time_start   423777 non-null  datetime64[ns]
 6   duration          423777 non-null  int32         
dtypes: datetime64[ns](1), int32(1), object(5)
memory usage: 24.2+ MB


## Download datasets

In [14]:
df_tot.to_csv('Data/Extraction 3/Clean data 4.csv',index=True)
df_subthemes.to_csv('Data/Extraction 3/Subthemes.csv',index=True)