# Descriptive statistics

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the dataset into a DataFrame and preview its first five rows.
# NOTE(review): the preview shows Titanic passenger columns (PassengerId, Survived, ...)
# although the file is named "movies_and_shows.csv", and the path is relative rather
# than under the mounted drive -- confirm this is the intended file.
df = pd.read_csv(filepath_or_buffer="movies_and_shows.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Measures of central tendency

In [None]:
# Arithmetic mean of Age (pandas skips missing values by default).
df["Age"].mean()

29.69911764705882

In [None]:
# Median of Age: the middle value of the sorted, non-missing ages.
df["Age"].median()

28.0

In [None]:
# Most frequent passenger class; mode() returns a Series, so take its first entry.
df["Pclass"].mode().iloc[0]

3

# Measures of spread

In [None]:
# Sample variance of Age (pandas uses ddof=1 by default).
df["Age"].var()

211.01912474630802

In [None]:
# Sample standard deviation of Age (square root of the sample variance).
df["Age"].std()

14.526497332334042

In [None]:
# Range: distance between the largest and the smallest recorded Age.
df["Age"].max() - df["Age"].min()

79.58

In [None]:
# Interquartile range (IQR) of Fare: third quartile minus first quartile.
df["Fare"].quantile(q=0.75) - df["Fare"].quantile(q=0.25)

23.0896

In [None]:
# Summary statistics for the numeric columns; the quartiles listed are describe()'s defaults.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Summary of the object-typed columns: count, distinct values, most common value and its frequency.
df.describe(include=["object"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


# Inferential statistics

Parametric tests

* t-test
* z-test
* ANOVA test
* F-test

In [None]:
from scipy import stats

In [None]:
# Independent two-sample t-test on Age: survivors vs. non-survivors.
# Missing ages are dropped from each group before the test is run.
age_survived = df.loc[df['Survived'] == 1, 'Age'].dropna()
age_not_survived = df.loc[df['Survived'] == 0, 'Age'].dropna()

t_statistic, p_value = stats.ttest_ind(age_survived, age_not_survived)

print("T-Test:")
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

T-Test:
T-Statistic: -2.06668694625381
P-Value: 0.03912465401348249


In [None]:
# One-sample z-test: is the mean Fare different from the hypothesised value 32?
from statsmodels.stats.weightstats import ztest

fare_values = df['Fare'].dropna()
z_statistic, p_value = ztest(fare_values, value=32)

print("\nZ-Test:")
print("Z-Statistic:", z_statistic)
print("P-Value:", p_value)

# Significance level used for the decision below.
alpha = 0.05

# Select the two-line verdict that matches the decision, then print both lines.
if p_value < alpha:
    verdict = (
        "\nReject the null hypothesis.",
        "There is enough evidence to suggest that the average Fare is different from 32.",
    )
else:
    verdict = (
        "\nFail to reject the null hypothesis.",
        "There is not enough evidence to suggest that the average Fare is different from 32.",
    )
print(*verdict, sep="\n")


Z-Test:
Z-Statistic: 0.12266271558913089
P-Value: 0.9023741894934963

Fail to reject the null hypothesis.
There is not enough evidence to suggest that the average Fare is different from 32.


In [None]:
# One-way ANOVA: does the mean Fare differ across the three passenger classes?
fare_by_class = [
    df.loc[df['Pclass'] == pclass, 'Fare'].dropna()
    for pclass in (1, 2, 3)
]
anova_result = stats.f_oneway(*fare_by_class)

print("\nANOVA Test:")
print("F-Statistic:", anova_result.statistic)
print("P-Value:", anova_result.pvalue)

# Significance level used for the decision below.
alpha = 0.05

# Select the two-line verdict that matches the decision, then print both lines.
if anova_result.pvalue < alpha:
    verdict = (
        "\nReject the null hypothesis.",
        "There is enough evidence to suggest that there are significant differences in the mean Fare among different Pclass levels.",
    )
else:
    verdict = (
        "\nFail to reject the null hypothesis.",
        "There is not enough evidence to suggest that there are significant differences in the mean Fare among different Pclass levels.",
    )
print(*verdict, sep="\n")



ANOVA Test:
F-Statistic: 242.34415651744814
P-Value: 1.0313763209141171e-84

Reject the null hypothesis.
There is enough evidence to suggest that there are significant differences in the mean Fare among different Pclass levels.


In [None]:
# F-test on Fare between survivors and non-survivors.
# stats.f_oneway is a one-way ANOVA, so the F statistic here compares the two
# group MEANS; it is not a variance-ratio F-test.
fare_survived = df.loc[df['Survived'] == 1, 'Fare'].dropna()
fare_not_survived = df.loc[df['Survived'] == 0, 'Fare'].dropna()

f_statistic, p_value = stats.f_oneway(fare_survived, fare_not_survived)

print("\nF-Test:")
print("F-Statistic:", f_statistic)
print("P-Value:", p_value)

# Significance level used for the decision below.
alpha = 0.05

# Select the two-line verdict that matches the decision, then print both lines.
if p_value < alpha:
    verdict = (
        "\nReject the null hypothesis.",
        "There is enough evidence to suggest that there are significant differences in the mean Fare between Survived and Not Survived groups.",
    )
else:
    verdict = (
        "\nFail to reject the null hypothesis.",
        "There is not enough evidence to suggest that there are significant differences in the mean Fare between Survived and Not Survived groups.",
    )
print(*verdict, sep="\n")



F-Test:
F-Statistic: 63.03076422804448
P-Value: 6.120189341921873e-15

Reject the null hypothesis.
There is enough evidence to suggest that there are significant differences in the mean Fare between Survived and Not Survived groups.


# Non-parametric tests

* Chi-squared test
* Kolmogorov–Smirnov (KS) test

In [None]:
# Chi-squared test of independence between Survived and Pclass.
# Build the contingency table of observed counts (rows: Survived, columns: Pclass).
observed = pd.crosstab(index=df['Survived'], columns=df['Pclass'])
print(observed)

# Only the statistic and the p-value are used; the remaining two returned values are discarded.
chi2_stat, p_value, _, _ = stats.chi2_contingency(observed)

print("Chi-squared Test:")
print("Chi2 Statistic:", chi2_stat)
print("P-Value:", p_value)

# Significance level used for the decision below.
alpha = 0.05

# Select the two-line verdict that matches the decision, then print both lines.
if p_value < alpha:
    verdict = (
        "\nReject the null hypothesis.",
        "There is enough evidence to suggest a significant association between Survived and Pclass.",
    )
else:
    verdict = (
        "\nFail to reject the null hypothesis.",
        "There is not enough evidence to suggest a significant association between Survived and Pclass.",
    )
print(*verdict, sep="\n")


Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119
Chi-squared Test:
Chi2 Statistic: 102.88898875696056
P-Value: 4.549251711298793e-23

Reject the null hypothesis.
There is enough evidence to suggest a significant association between Survived and Pclass.


In [None]:
# Perform KS test for normality of Fare.
#
# kstest(x, 'norm') on its own compares the sample against the STANDARD normal
# N(0, 1). Fare is not standardised (mean ~32, std ~50 per df.describe()), so
# that comparison rejects because of location and scale alone and says nothing
# about the shape of the distribution. Passing the sample mean and standard
# deviation as the distribution's loc and scale tests against a normal with
# matching location and spread.
#
# Caveat: estimating loc/scale from the same sample makes the classical KS
# p-value conservative; a Lilliefors-corrected test is the strict alternative.
fare = df['Fare'].dropna()
ks_stat, p_value = stats.kstest(fare, 'norm', args=(fare.mean(), fare.std()))
print("\nKS Test:")
print("KS Statistic:", ks_stat)
print("P-Value:", p_value)

# Set the significance level (alpha)
alpha = 0.05

# Compare the p-value to the significance level
if p_value < alpha:
    print("\nReject the null hypothesis.")
    print("There is enough evidence to suggest that the 'Fare' variable does not follow a normal distribution.")
else:
    print("\nFail to reject the null hypothesis.")
    print("There is not enough evidence to suggest that the 'Fare' variable does not follow a normal distribution.")



KS Test:
KS Statistic: 0.9831349436254704
P-Value: 0.0

Reject the null hypothesis.
There is enough evidence to suggest that the 'Fare' variable does not follow a normal distribution.
