In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the dataset
# Read the training CSV into `df`, the frame the later cells operate on.
train_csv = "train.csv"
df = pd.read_csv(train_csv)

In [4]:
# 1. Basic Data Inspection
# Prints the frame's shape, its column/dtype/non-null report, and the first rows.
print("Dataset Shape:", df.shape)
print("\nColumn Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
df.info()
print("\nFirst 5 Rows:")
print(df.head())

Dataset Shape: (750000, 12)

Column Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), i

In [5]:
# 2. Missing Values Analysis
# Count nulls per column and report only the columns that actually have some.
print("\nMissing Values:")
missing_values = df.isna().sum()
has_missing = missing_values.gt(0)
print(missing_values.loc[has_missing])


Missing Values:
Episode_Length_minutes          87093
Guest_Popularity_percentage    146030
Number_of_Ads                       1
dtype: int64


In [6]:
# Visualize missing values
# Draw the boolean null mask as a heatmap, save it to a PNG, then close the figure.
fig, ax = plt.subplots(figsize=(10, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
fig.savefig('missing_values_heatmap.png')
plt.close(fig)

In [7]:
# 3. Summary Statistics for Numerical Columns
# `numerical_cols` is reused by the later distribution and correlation cells.
numerical_cols = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Listening_Time_minutes',
]
print("\nSummary Statistics for Numerical Columns:")
summary_stats = df[numerical_cols].describe()
print(summary_stats)


Summary Statistics for Numerical Columns:
       Episode_Length_minutes  Host_Popularity_percentage  \
count           662907.000000               750000.000000   
mean                64.504738                   59.859901   
std                 32.969603                   22.873098   
min                  0.000000                    1.300000   
25%                 35.730000                   39.410000   
50%                 63.840000                   60.050000   
75%                 94.070000                   79.530000   
max                325.240000                  119.460000   

       Guest_Popularity_percentage  Number_of_Ads  Listening_Time_minutes  
count                603970.000000  749999.000000           750000.000000  
mean                     52.236449       1.348855               45.437406  
std                      28.451241       1.151130               27.138306  
min                       0.000000       0.000000                0.000000  
25%                      28

In [8]:
# 4. Distribution of Numerical Variables
# One histogram with a KDE overlay per numerical column, saved as distribution_<column>.png.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    non_null_values = df[col].dropna()
    sns.histplot(non_null_values, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.savefig(f'distribution_{col}.png')
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

In [9]:
# 5. Categorical Variables Analysis
# For each categorical column: print the level frequencies, then save a count plot
# whose bars follow the same descending-frequency order.
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\nValue Counts for {col}:")
    print(level_counts)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=df, order=level_counts.index, ax=ax)
    ax.set_title(f"Count of {col}")
    plt.xticks(rotation=45)
    fig.savefig(f'count_{col}.png')
    plt.close(fig)


Value Counts for Genre:
Genre
Sports        87606
Technology    86256
True Crime    85059
Lifestyle     82461
Comedy        81453
Business      80521
Health        71416
News          63385
Music         62743
Education     49100
Name: count, dtype: int64

Value Counts for Publication_Day:
Publication_Day
Sunday       115946
Monday       111963
Friday       108237
Wednesday    107886
Thursday     104360
Saturday     103505
Tuesday       98103
Name: count, dtype: int64

Value Counts for Publication_Time:
Publication_Time
Night        196849
Evening      195778
Afternoon    179460
Morning      177913
Name: count, dtype: int64

Value Counts for Episode_Sentiment:
Episode_Sentiment
Neutral     251291
Negative    250116
Positive    248593
Name: count, dtype: int64


In [10]:
# 6. Target Variable vs Categorical Variables (Boxplots)
# One boxplot of Listening_Time_minutes per categorical column, each saved to its own PNG.
target_col = 'Listening_Time_minutes'
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=col, y=target_col, data=df, ax=ax)
    ax.set_title(f"Listening Time vs {col}")
    plt.xticks(rotation=45)
    fig.savefig(f'boxplot_listening_time_vs_{col}.png')
    plt.close(fig)

In [11]:
# 7. Correlation Analysis for Numerical Variables
# Pairwise correlations of the numerical columns, drawn as an annotated heatmap
# with the colour scale centred on zero.
corr_matrix = df[numerical_cols].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Variables")
fig.savefig('correlation_heatmap.png')
plt.close(fig)

In [12]:
# 8. Scatter Plot: Episode Length vs Listening Time
# One point per row, coloured by Genre and sized by Number_of_Ads, saved to a PNG.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(
    x='Episode_Length_minutes',
    y='Listening_Time_minutes',
    hue='Genre',
    size='Number_of_Ads',
    data=df,
)
plt.title("Episode Length vs Listening Time by Genre and Number of Ads")
# NOTE(review): the saved run emitted a warning from the savefig line; with this many
# points the usual cause is the legend's default loc="best" placement search, which
# scans every point. Anchoring the legend outside the axes skips that search and
# also keeps the legend from covering the data.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1.02, 1))
# bbox_inches='tight' grows the saved canvas so the outside legend is not cropped.
plt.savefig('scatter_episode_length_vs_listening_time.png', bbox_inches='tight')
plt.close()

  plt.savefig('scatter_episode_length_vs_listening_time.png')


In [13]:
# 9. Groupby Analysis: Average Listening Time by Genre and Publication Day
# For each grouping column, print the mean listening time per group in ascending order.
for label, group_col in (("Genre", 'Genre'), ("Publication Day", 'Publication_Day')):
    print(f"\nAverage Listening Time by {label}:")
    mean_listening = df.groupby(group_col)['Listening_Time_minutes'].mean()
    print(mean_listening.sort_values())


Average Listening Time by Genre:
Genre
News          44.406144
Comedy        44.426441
Sports        44.935722
Lifestyle     45.523117
Business      45.538151
Technology    45.634749
Education     45.736640
Health        45.741413
True Crime    46.042507
Music         46.578394
Name: Listening_Time_minutes, dtype: float64

Average Listening Time by Publication Day:
Publication_Day
Sunday       44.817398
Thursday     44.869706
Friday       45.206591
Saturday     45.326775
Wednesday    45.807177
Monday       45.969630
Tuesday      46.131411
Name: Listening_Time_minutes, dtype: float64
