In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
pd.options.plotting.backend = "plotly"

In [69]:
# Load the user table and derive the columns used by the rest of the notebook.
df = pd.read_csv('datasets/users.db.csv')

# Columns handled as categorical in the association analysis below.
categorical_columns = ['gender', 'voyage', 'laugh', 'photo_keke', 'photo_beach']

df = df.drop(columns=['last.pr.update'])

# Parse the three date columns so they can be subtracted from each other.
date_columns = ['last.up.photo', 'last.connex', 'date.crea']
df[date_columns] = df[date_columns].apply(pd.to_datetime)

# Account age = whole days between account creation and last connection.
df['account_age'] = (df['last.connex'] - df['date.crea']).dt.days

# Discard rows whose last connection precedes the creation date.
df = df.drop(index=df.index[df['account_age'] < 0])

# regex=False: the dot must be replaced literally. Interpreted as a regular
# expression, '.' matches every character and would wipe out the column names.
df.columns = df.columns.str.replace('.', '_', regex=False)
df.head()


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



Unnamed: 0,userid,date_crea,score,n_matches,n_updates_photo,n_photos,last_connex,last_up_photo,gender,sent_ana,length_prof,voyage,laugh,photo_keke,photo_beach,account_age
0,1,2011-09-17,1.495834,11,5,6,2011-10-07,2011-10-02,1,6.490446,0.0,0,0,0,0,20
1,2,2017-01-17,8.946863,56,2,6,2017-01-31,2017-02-03,1,4.589125,20.722862,0,0,0,1,14
2,3,2019-05-14,2.496199,13,3,4,2019-06-17,2019-06-19,1,6.473182,31.399277,0,0,0,1,34
3,4,2015-11-27,2.823579,32,5,2,2016-01-15,2015-12-09,0,5.368982,0.0,0,0,0,1,49
4,5,2014-11-28,2.117433,21,1,4,2015-01-15,2015-01-02,0,5.573949,38.510225,0,1,0,0,48


In [70]:
# Pearson correlation matrix of the numeric, non-categorical columns.
# The numeric columns are selected explicitly: the frame still holds datetime
# columns, and recent pandas versions no longer drop non-numeric columns
# silently inside DataFrame.corr().
corr = (
    df.drop(columns=categorical_columns)
      .select_dtypes(include='number')
      .corr()
)
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,userid,score,n_matches,n_updates_photo,n_photos,sent_ana,length_prof,account_age
userid,1.0,0.010505,0.004513,0.008379,-0.035393,-0.013198,-0.006054,0.005241
score,0.010505,1.0,0.902517,0.29173,0.054298,0.392814,-0.0431,-0.128675
n_matches,0.004513,0.902517,1.0,0.319987,-0.004403,0.44137,-0.027701,0.003714
n_updates_photo,0.008379,0.29173,0.319987,1.0,-0.019342,0.136072,-0.014271,-0.00202
n_photos,-0.035393,0.054298,-0.004403,-0.019342,1.0,-0.046853,-0.040108,-0.220414
sent_ana,-0.013198,0.392814,0.44137,0.136072,-0.046853,1.0,0.015214,0.041083
length_prof,-0.006054,-0.0431,-0.027701,-0.014271,-0.040108,0.015214,1.0,0.132073
account_age,0.005241,-0.128675,0.003714,-0.00202,-0.220414,0.041083,0.132073,1.0


In [71]:
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the bias correction from Bergsma, W. (2013), "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42: 323-328.

    Parameters
    ----------
    confusion_matrix : array-like of shape (r, k)
        Contingency table of observed counts (a NumPy array, a nested list or
        a DataFrame such as the result of ``pd.crosstab``).

    Returns
    -------
    float
        Association strength; 0 means no association and 1 a perfect one.
        NaN when the statistic is undefined (fewer than two observations, or
        a table with a single row or column).

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when an expected frequency is
        zero (e.g. an all-zero row or column).
    """
    # Work on a plain ndarray so that .sum() is the grand total even when a
    # DataFrame is passed (DataFrame.sum() would return per-column totals).
    table = np.asarray(confusion_matrix)

    # correction=False: the formula is defined on the plain Pearson
    # chi-square. The default Yates continuity correction (applied to 2x2
    # tables) would bias V downward, so perfect association would not give 1.
    chi2 = chi2_contingency(table, correction=False)[0]
    n = table.sum()
    r, k = table.shape

    if n <= 1:
        # The (n - 1) denominators below would be zero.
        return np.nan

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # A single row or column carries no association information.
        return np.nan
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between every pair of categorical columns.
data_encoded = df[categorical_columns]

# One inner list per row variable, one entry per column variable; each entry
# is the statistic of the two columns' cross-tabulation, rounded to 2 decimals.
rows = [
    [
        round(cramers_v(pd.crosstab(df[row_var], df[col_var]).values), 2)
        for col_var in data_encoded
    ]
    for row_var in data_encoded
]

cramers_results = np.array(rows)

# Label both axes with the variable names and render as a heat map.
corr = pd.DataFrame(
    cramers_results,
    index=data_encoded.columns,
    columns=data_encoded.columns,
)
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,gender,voyage,laugh,photo_keke,photo_beach
gender,1.0,0.03,0.0,0.14,0.19
voyage,0.03,1.0,0.0,0.0,0.0
laugh,0.0,0.0,1.0,0.0,0.0
photo_keke,0.14,0.0,0.0,1.0,0.05
photo_beach,0.19,0.0,0.0,0.05,1.0


In [72]:
# Decode the numeric gender codes into readable labels.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the df['gender'] selection only updates df when that selection is a view,
# which is not the case under pandas Copy-on-Write.
df['gender'] = df['gender'].replace({0 : 'Female', 1 : 'Male', 2 : 'Unknown'})

In [73]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-way ANOVA: is the mean account_age the same across gender groups?
mod = ols(formula='account_age ~ gender', data=df).fit()

# typ=1 requests the Type I ANOVA table for the fitted model.
aov_table = sm.stats.anova_lm(mod, typ=1)
print(aov_table)

# Box plot of the same response, one box per gender group.
df.plot.box(x='gender', y='account_age')

              df         sum_sq        mean_sq            F  PR(>F)
gender       2.0  417481.070234  208740.535117  1436.471866     0.0
Residual  2986.0  433909.812336     145.314740          NaN     NaN


In [74]:
# One-way ANOVA: is the mean n_photos the same across gender groups?
mod = ols(formula='n_photos ~ gender', data=df).fit()

# typ=1 requests the Type I ANOVA table for the fitted model.
aov_table = sm.stats.anova_lm(mod, typ=1)
print(aov_table)

# Box plot of the same response, one box per gender group.
df.plot.box(x='gender', y='n_photos')

              df       sum_sq     mean_sq           F        PR(>F)
gender       2.0   893.116126  446.558063  170.965198  5.045970e-71
Residual  2986.0  7799.379023    2.611982         NaN           NaN


# Dimensionality Reduction

In [75]:
from sklearn.preprocessing import StandardScaler

# Inputs to the PCA step. The list mixes the numeric measures with the
# voyage / laugh / photo_* columns that were treated as categorical earlier.
features = [
    'score',
    'n_matches',
    'n_updates_photo',
    'n_photos',
    'sent_ana',
    'length_prof',
    'account_age',
    'voyage',
    'laugh',
    'photo_keke',
    'photo_beach',
]

# Standardise every feature to zero mean and unit variance.
# NOTE: the scaled values overwrite the original columns of df.
scaler = StandardScaler().fit(df[features])
df[features] = scaler.transform(df[features])

In [76]:
from sklearn.decomposition import PCA

# Project the feature columns onto their first two principal components.
pca = PCA(n_components=2)
X = df[features]
components = pca.fit_transform(X)

# Loadings: each principal axis scaled by the square root of the variance it
# explains; row i holds the (PC1, PC2) coordinates of features[i].
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Scatter of the component scores, coloured by gender.
fig = px.scatter(components, x=0, y=1, color=df['gender'])

# Overlay one labelled segment per feature, from the origin to its loading.
for feature, (tip_x, tip_y) in zip(features, loadings):
    fig.add_shape(type='line', x0=0, y0=0, x1=tip_x, y1=tip_y)
    fig.add_annotation(
        x=tip_x, y=tip_y,
        ax=0, ay=0,
        xanchor="center", yanchor="bottom",
        text=feature,
    )

fig.show()