In [97]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
from scipy.stats import chi2_contingency

In [2]:
df=pd.read_csv("c:/Assignmentt/traffic/traffic.csv")

In [13]:
# Get basic information about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   event    226278 non-null  object
 1   date     226278 non-null  object
 2   country  226267 non-null  object
 3   city     226267 non-null  object
 4   artist   226241 non-null  object
 5   album    226273 non-null  object
 6   track    226273 non-null  object
 7   isrc     219157 non-null  object
 8   linkid   226278 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB


In [14]:
# Check for missing values
df.isnull().sum()

event         0
date          0
country      11
city         11
artist       37
album         5
track         5
isrc       7121
linkid        0
dtype: int64

In [23]:
#drop missing values
df=df.dropna()


In [38]:
#change datatype of date column
df["date"]=pd.to_datetime(df["date"],errors='coerce')

### Total and Daily Pageview Events:

1. How many total pageview events did the links in the dataset receive during the entire period?

##### Total pageview events

In [44]:
pageviews=df[df["event"]=="pageview"]
total_pageviews=pageviews.shape[0]
print(f"Total pageview events: {total_pageviews}")

Total pageview events: 137115


2.What is the average number of pageview events per day?

##### Average pageview events per day

In [52]:
avg_daily_pageviews= pageviews.groupby('date').size().mean()
print(f"Average daily pageview events" ,round(avg_daily_pageviews))

Average daily pageview events 19588


### Analysis of Other Events:

##### What is the total count and distribution of other recorded events in the dataset?

##### count all events

In [59]:
distribution_of_event=df.event.value_counts()
print("distribution_of_event")
print(distribution_of_event)

distribution_of_event
event
pageview    137115
click        53504
preview      28530
Name: count, dtype: int64


### Geographical Distribution:

##### Which countries contributed to the pageviews?

In [64]:
geo_distribution=pageviews['country'].value_counts()
print("Geographical distribution of pageviews:")
# print(geo_distribution.head(5))
print(geo_distribution)

Geographical distribution of pageviews:
country
Saudi Arabia                28597
India                       27285
United States               17311
France                       9658
Iraq                         4830
                            ...  
Guinea-Bissau                   1
Central African Republic        1
Guernsey                        1
Sint Maarten                    1
Saint Martin                    1
Name: count, Length: 211, dtype: int64


### Click-Through Rate (CTR) Analysis:

##### What is the overall click-through rate (CTR) calculated as clicks/pageviews?

In [74]:
click=df[df['event']=='click']
#calculate overall CTR
total_click=click.shape[0]
overall_CTR=total_click/total_pageviews
print("Overall click through rate CTR :",overall_CTR)

Overall click through rate CTR : 0.3902125952667469


##### How does the CTR vary across different links?

In [75]:
ctr_by_link = click.groupby('linkid')['event'].count() / pageviews.groupby('linkid')['event'].count()
print("CTR by link:")
print(ctr_by_link)

CTR by link:
linkid
006af6a0-1f0d-4b0c-93bf-756af9071c06    0.222222
00759b81-3f04-4a61-b934-f8fb3185f4a0    0.750000
00829040-ee01-4409-966d-d67c7965144a    0.564103
009193ee-c3df-4efa-88f2-feb37c0bfdf2    0.666667
00de7566-f014-4d20-8616-82e4dea45b88    0.200000
                                          ...   
fe8f7a23-be9d-49a6-b9b5-d26823c3f911    0.406593
ff685183-215d-4729-9429-80f087eb6ce8         NaN
ffa88c9a-4e1b-42cd-93a9-0972179c7d02    0.666667
ffd3c9e7-c5c5-4f28-b03d-cbaec33f2152    0.881818
ffd8d5a7-91bc-48e1-a692-c26fca8a8ead    0.345238
Name: event, Length: 743, dtype: float64


### Correlation Analysis:

##### Is there a correlation between clicks and previews on a link? If so, is it statistically significant, and how strong is the effect?

In [77]:
link_summary=df.groupby('linkid').agg({"event":"count","country":"first"}).reset_index()

In [80]:
link_summary.rename(columns={"event":"total_events"},inplace=True)

In [85]:
link_summary["clicks"]= link_summary["total_events"]*(df["event"]=="click").mean()
link_summary["pageviews"]= link_summary["total_events"]*(df["event"]=="pageview").mean()

In [87]:
#correlation Analysis

In [101]:
correlation,p_value=pearsonr(link_summary["clicks"],link_summary["pageviews"])
print(f"Correlation between clicks and pageviews: {correlation}, p-value: {p_value}")


Correlation between clicks and pageviews: 1.0, p-value: 0.0


##### Correlation Coefficient: 1.0: This indicates a perfect positive correlation between clicks and pageviews.
##### A p-value of 0.0 (or very close to it) indicates that it is highly unlikely that this result is due to random chance.

##### Perform tests for both potential linear relationships and categorical (binary) relationships between these variables.

In [96]:
# Define X and Y for linear regression
X=link_summary['pageviews']
Y=link_summary['clicks']
X=sm.add_constant(X)
model=sm.OLS(Y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 clicks   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.382e+32
Date:                Sun, 22 Sep 2024   Prob (F-statistic):               0.00
Time:                        01:13:18   Log-Likelihood:                 19841.
No. Observations:                 743   AIC:                        -3.968e+04
Df Residuals:                     741   BIC:                        -3.967e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       9.219e-14   2.28e-14      4.043      0.0

##### check relationships among categorical variables


In [99]:
contingency_table=pd.crosstab(df["event"],df["country"])
chi2, p, dof, expected=chi2_contingency(contingency_table)
print(f"Chi-squared statistic: {chi2}, p-value: {p}")

Chi-squared statistic: 4823.237813060974, p-value: 0.0


###### Chi-squared Statistic: 4823.24: This is a high value, suggesting a significant association between the event types (clicks and pageviews) and the countries.
###### P-Value: 0.0: Similar to the correlation, this indicates that the relationship is statistically significant. The low p-value suggests that the observed distribution of clicks and pageviews across different countries is not due to random variation.