In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation/future warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Location of the raw Direct Marketing CSV.
DATA_URL = "https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/DirectMarketing.csv"

# Keep the frame exactly as downloaded and do all further work on a copy.
Direct_Marketing = pd.read_csv(DATA_URL)
df = Direct_Marketing.copy()

# Preview the first 25 rows.
df.head(25)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Dtype assigned to each column.
df.dtypes

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

In [None]:
# Rows that are exact repeats of an earlier row. duplicated() already returns
# a boolean mask, so it is used directly instead of being compared to True.
df[df.duplicated()]

In [None]:
# Distinct values of every object-dtype column, keyed by column name.
object_columns = df.select_dtypes('object').columns
{name: list(df[name].unique()) for name in object_columns}

In [None]:
# True if at least one cell anywhere in the frame is null.
df.isnull().values.any()

In [None]:
# Number of null cells in each column.
df.isnull().sum()

In [None]:
def missing_data(data):
    """Tabulate missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns: ``Total`` (number of
        null cells) and ``Percent`` (null cells as a percentage of the number
        of rows). Both are sorted in descending order before being combined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask yields the number of rows for every column.
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Show the per-column missing-value table for the working frame.
missing_data(df)

In [None]:
# Pairwise correlations of the numeric columns only. The frame also holds
# text columns (e.g. Age, Gender); pandas >= 2.0 raises on them when
# DataFrame.corr() is called directly, instead of dropping them silently.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
print("Correlation Matrix")
plt.rcParams['figure.figsize'] = (8, 6)

# Correlate numeric columns only: the frame also holds text columns, and
# pandas >= 2.0 raises on them when DataFrame.corr() is called directly.
numeric_corr = df.select_dtypes(include=['number', 'bool']).corr()

# Annotated heatmap; the trailing semicolon suppresses the Axes repr.
sns.heatmap(numeric_corr, cmap='coolwarm', linewidths=.5, fmt=".2f", annot=True);

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Frequency of each age group as a two-column frame: 'Age' and 'count'.
# The index and the values are named explicitly so the column names are stable:
# renaming {'index': 'Age', 'Age': 'count'} after reset_index() only works on
# pandas < 2.0, where value_counts() leaves the index unnamed.
df_Age = df['Age'].value_counts().rename_axis('Age').reset_index(name='count')
df_Age

In [None]:
# Frequency of each age group as columns 'Age' and 'count'. Naming the index
# and the values explicitly works on both pandas < 2.0 and >= 2.0, unlike a
# rename of the 'index' column after reset_index().
df_Age = df['Age'].value_counts().rename_axis('Age').reset_index(name='count')

# Pull only the first slice (the most frequent group, since value_counts sorts
# descending) out of the donut, however many age groups there are.
slice_pull = [0.2] + [0] * (len(df_Age) - 1)

fig = go.Figure([go.Pie(labels=df_Age['Age'], values=df_Age['count'], pull=slice_pull, hole=0.4)])

fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=12, insidetextorientation='radial')

fig.update_layout(title="Age Distribution", title_x=0.5)
fig.show()

In [None]:
# Frequency of each age group as columns 'Age' and 'count'. Naming the index
# and the values explicitly works on both pandas < 2.0 and >= 2.0, unlike a
# rename of the 'index' column after reset_index().
df_Age = df['Age'].value_counts().rename_axis('Age').reset_index(name='count')

# One bar per age group, colored and labeled by its count.
fig = go.Figure(go.Bar(
    x=df_Age['Age'], y=df_Age['count'],
    marker={'color': df_Age['count'],
            'colorscale': 'Viridis'},
    text=df_Age['count'],
    textposition="outside",
))
fig.update_layout(title_text='Age Distribution', xaxis_title="Age", yaxis_title="Count ", title_x=0.5)
fig.show()

In [None]:
# Mean salary and mean amount spent for each age group, rounded to 2 decimals.
grouped_by_age = df.groupby(by=['Age'])

df_Age_Salary = grouped_by_age['Salary'].mean().reset_index(name='AVG_Salary')
# The key column gets a throwaway name here so it cannot collide with 'Age'
# once the two frames are placed side by side.
df_Age_AmountSpent = (
    grouped_by_age['AmountSpent']
    .mean()
    .rename_axis('Age1')
    .reset_index(name='AVG_AmountSpent')
)

result = pd.concat([df_Age_Salary, df_Age_AmountSpent], axis=1).drop(columns=['Age1'])
for mean_column in ("AVG_Salary", "AVG_AmountSpent"):
    result[mean_column] = result[mean_column].map(lambda mean_value: round(mean_value, 2))
result

In [None]:
# Mean salary and mean amount spent for each age group, rounded to 2 decimals.
grouped_by_age = df.groupby(by=['Age'])

df_Age_Salary = grouped_by_age['Salary'].mean().reset_index(name='AVG_Salary')
# The key column gets a throwaway name here so it cannot collide with 'Age'
# once the two frames are placed side by side.
df_Age_AmountSpent = (
    grouped_by_age['AmountSpent']
    .mean()
    .rename_axis('Age1')
    .reset_index(name='AVG_AmountSpent')
)

result = pd.concat([df_Age_Salary, df_Age_AmountSpent], axis=1).drop(columns=['Age1'])
for mean_column in ("AVG_Salary", "AVG_AmountSpent"):
    result[mean_column] = result[mean_column].map(lambda mean_value: round(mean_value, 2))

# One stacked panel per statistic: (subplot title, value column, trace name).
age_panels = [
    ("Age AVG Salary ", "AVG_Salary", "Mean Salary"),
    ("Age AVG Amount Spent ", "AVG_AmountSpent", "Mean Amount Spent"),
]

fig = make_subplots(
    rows=len(age_panels), cols=1,
    subplot_titles=tuple(panel[0] for panel in age_panels),
)

for row_number, (panel_title, value_column, trace_name) in enumerate(age_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=result['Age'], y=result[value_column],
            name=trace_name,
            marker={'color': result[value_column], 'colorscale': 'fall'},
            text=result[value_column],
            textposition="inside",
        ),
        row=row_number, col=1,
    )

fig.update_layout(title="Age ", title_x=0.5)
fig.show()

In [None]:
# Salary against amount spent, one color and one OLS trendline per age group.
age_scatter_layout = dict(
    title='Age With Salary Vs Amount Spent ',
    xaxis_title="Salary",
    yaxis_title="Amount Spent",
    title_x=0.5,
)

fig = px.scatter(data_frame=df, x='Salary', y='AmountSpent', color='Age', trendline="ols")
fig.update_layout(**age_scatter_layout)
fig.show()

In [None]:
# Frequency of each gender as a two-column frame: 'Gender' and 'count'.
# The index and the values are named explicitly so the column names are stable:
# renaming {'index': 'Gender', 'Gender': 'count'} after reset_index() only
# works on pandas < 2.0, where value_counts() leaves the index unnamed.
df_Gender = df['Gender'].value_counts().rename_axis('Gender').reset_index(name='count')
df_Gender

In [None]:
# One bar per gender, colored and labeled by its count. Reads the df_Gender
# frame (columns 'Gender' and 'count') built earlier in the notebook.
gender_counts = df_Gender['count']

gender_bar = go.Bar(
    x=df_Gender['Gender'],
    y=gender_counts,
    marker=dict(color=gender_counts, colorscale='Viridis'),
    text=gender_counts,
    textposition="outside",
)

fig = go.Figure(gender_bar)
fig.update_layout(
    title_text='Gender Distribution',
    xaxis_title="Gender",
    yaxis_title="Count ",
    title_x=0.5,
)
fig.show()

In [None]:
# Mean salary and mean amount spent for each gender, rounded to 2 decimals.
grouped_by_gender = df.groupby(by=['Gender'])

df_Gender_Salary = grouped_by_gender['Salary'].mean().reset_index(name='AVG_Salary')
# The key column gets a throwaway name here so it cannot collide with 'Gender'
# once the two frames are placed side by side.
df_Gender_AmountSpent = (
    grouped_by_gender['AmountSpent']
    .mean()
    .rename_axis('Gender1')
    .reset_index(name='AVG_AmountSpent')
)

result = pd.concat([df_Gender_Salary, df_Gender_AmountSpent], axis=1).drop(columns=['Gender1'])
for mean_column in ("AVG_Salary", "AVG_AmountSpent"):
    result[mean_column] = result[mean_column].map(lambda mean_value: round(mean_value, 2))

# One stacked panel per statistic: (subplot title, value column, trace name).
gender_panels = [
    ("Gender AVG Salary ", "AVG_Salary", "Mean Salary"),
    ("Gender AVG Amount Spent ", "AVG_AmountSpent", "Mean Amount Spent"),
]

fig = make_subplots(
    rows=len(gender_panels), cols=1,
    subplot_titles=tuple(panel[0] for panel in gender_panels),
)

for row_number, (panel_title, value_column, trace_name) in enumerate(gender_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=result['Gender'], y=result[value_column],
            name=trace_name,
            marker={'color': result[value_column], 'colorscale': 'fall'},
            text=result[value_column],
            textposition="inside",
        ),
        row=row_number, col=1,
    )

fig.update_layout(title="Gender ", title_x=0.5)
fig.show()

In [None]:
# Salary against amount spent, one color and one OLS trendline per gender.
gender_scatter_layout = dict(
    title='Gender With Salary Vs Amount Spent ',
    xaxis_title="Salary",
    yaxis_title="Amount Spent",
    title_x=0.5,
)

fig = px.scatter(data_frame=df, x='Salary', y='AmountSpent', color='Gender', trendline="ols")
fig.update_layout(**gender_scatter_layout)
fig.show()

In [None]:
# Number of non-null 'Married' entries in each Gender/Age group, as a flat
# frame with columns 'Gender', 'Age' and 'count'.
df_G_and_A = (
    df.groupby(by=['Gender', 'Age'])['Married']
    .count()
    .reset_index(name='count')
)
df_G_and_A

In [None]:
# Number of non-null 'Married' entries in each Gender/Age group, as a flat
# frame with columns 'Gender', 'Age' and 'count'.
df_G_and_A = (
    df.groupby(by=['Gender', 'Age'])['Married']
    .count()
    .reset_index(name='count')
)

# Grouped bars: age groups on the x axis, one bar per gender within each.
fig = px.bar(df_G_and_A, x="Age", y="count", color="Gender", barmode="group")
fig.update_layout(title_text='Age Count With Gender', title_x=0.5, yaxis_title="Count")
fig.show()

In [None]:
# Mean amount spent in each Gender/Age group, rounded to 2 decimals, as a
# flat frame with columns 'Gender', 'Age' and 'AVG_AmountSpent'.
df_G_and_A = (
    df.groupby(by=['Gender', 'Age'])['AmountSpent']
    .mean()
    .map(lambda mean_value: round(mean_value, 2))
    .reset_index(name='AVG_AmountSpent')
)
df_G_and_A

In [None]:
# Mean amount spent in each Gender/Age group, rounded to 2 decimals, as a
# flat frame with columns 'Gender', 'Age' and 'AVG_AmountSpent'.
df_G_and_A = (
    df.groupby(by=['Gender', 'Age'])['AmountSpent']
    .mean()
    .map(lambda mean_value: round(mean_value, 2))
    .reset_index(name='AVG_AmountSpent')
)

# Grouped bars: age groups on the x axis, one bar per gender within each.
fig = px.bar(df_G_and_A, x="Age", y="AVG_AmountSpent", color="Gender", barmode="group")
fig.update_layout(
    title_text='Age Amount Spent With Gender,Age',
    title_x=0.5,
    yaxis_title="Amount Spent",
)
fig.show()

In [None]:
# Minimum, mean and maximum amount spent for every Age/Gender combination.
amount_by_age_gender = df.groupby(by=["Age", "Gender"])['AmountSpent']
df_age_gender_agg = amount_by_age_gender.agg(["min", "mean", "max"])
df_age_gender_agg

In [None]:
# Mean / max / min / count of AmountSpent for every Gender/Age group.
amount_by_gender_age = df.groupby(by=['Gender', 'Age'])['AmountSpent']

# Only the first frame keeps the real key names; the others get numbered key
# columns so nothing collides when the frames are placed side by side.
df_G_and_A_AVG = amount_by_gender_age.mean().reset_index(name='AVG_AmountSpent')
df_G_and_A_Max = (
    amount_by_gender_age.max()
    .rename_axis(['Gender1', 'Age1'])
    .reset_index(name='Max_AmountSpent')
)
df_G_and_A_Min = (
    amount_by_gender_age.min()
    .rename_axis(['Gender2', 'Age2'])
    .reset_index(name='Min_AmountSpent')
)
df_G_and_A_Count = (
    amount_by_gender_age.count()
    .rename_axis(['Gender3', 'Age3'])
    .reset_index(name='Count')
)

# Keep the keys once (from the mean frame) and only the value column of the rest.
result = pd.concat(
    [
        df_G_and_A_AVG,
        df_G_and_A_Max[['Max_AmountSpent']],
        df_G_and_A_Min[['Min_AmountSpent']],
        df_G_and_A_Count[['Count']],
    ],
    axis=1,
)
result["AVG_AmountSpent"] = result["AVG_AmountSpent"].map(lambda mean_value: round(mean_value, 2))

# Collapse the two keys into one "<Gender> <Age>" label column.
result["Gender_Age"] = result["Gender"] + " " + result["Age"]
result = result.drop(columns=['Gender', 'Age'])
result

In [None]:
# Mean / max / min / count of AmountSpent for every Gender/Age group.
amount_by_gender_age = df.groupby(by=['Gender', 'Age'])['AmountSpent']

# Only the first frame keeps the real key names; the others get numbered key
# columns so nothing collides when the frames are placed side by side.
df_G_and_A_AVG = amount_by_gender_age.mean().reset_index(name='AVG_AmountSpent')
df_G_and_A_Max = (
    amount_by_gender_age.max()
    .rename_axis(['Gender1', 'Age1'])
    .reset_index(name='Max_AmountSpent')
)
df_G_and_A_Min = (
    amount_by_gender_age.min()
    .rename_axis(['Gender2', 'Age2'])
    .reset_index(name='Min_AmountSpent')
)
df_G_and_A_Count = (
    amount_by_gender_age.count()
    .rename_axis(['Gender3', 'Age3'])
    .reset_index(name='Count')
)

# Keep the keys once (from the mean frame) and only the value column of the rest.
result = pd.concat(
    [
        df_G_and_A_AVG,
        df_G_and_A_Max[['Max_AmountSpent']],
        df_G_and_A_Min[['Min_AmountSpent']],
        df_G_and_A_Count[['Count']],
    ],
    axis=1,
)
result["AVG_AmountSpent"] = result["AVG_AmountSpent"].map(lambda mean_value: round(mean_value, 2))

# Collapse the two keys into one "<Gender> <Age>" label column.
result["Gender_Age"] = result["Gender"] + " " + result["Age"]
result = result.drop(columns=['Gender', 'Age'])

# One stacked panel per statistic: (subplot title, value column, trace name).
summary_panels = [
    (" Mean Amount Spent", "AVG_AmountSpent", "Mean"),
    (" Min Amount Spent", "Min_AmountSpent", "Min"),
    (" Max Amount Spent", "Max_AmountSpent", "Max"),
    (" Count ", "Count", "Count"),
]

fig = make_subplots(
    rows=len(summary_panels), cols=1,
    subplot_titles=tuple(panel[0] for panel in summary_panels),
)

for row_number, (panel_title, value_column, trace_name) in enumerate(summary_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=result['Gender_Age'], y=result[value_column],
            name=trace_name,
            marker={'color': result[value_column], 'colorscale': 'fall'},
            text=result[value_column],
            textposition="inside",
        ),
        row=row_number, col=1,
    )

fig.update_layout(title="Gender Age With Amount Spent", title_x=0.5)
# Keep the "<Gender> <Age>" tick labels horizontal on every panel.
fig.update_xaxes(tickangle=0)
fig.show()

In [None]:
# Minimum, mean and maximum salary for every Age/Gender combination.
salary_by_age_gender = df.groupby(by=["Age", "Gender"])['Salary']
df_age_gender_agg = salary_by_age_gender.agg(["min", "mean", "max"])
df_age_gender_agg