In [1]:
# To begin this exploratory analysis, first import libraries and define functions and utilities to work with the data.

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# for beautiful plots and some types of graphs
import seaborn as sns

In [2]:
# IMPORTING THE DATA
# There is 1 csv file in the current version of the dataset

# NOTE(review): this is an absolute, machine-specific path to the CSV file itself
# (not a directory); adjust it to wherever the dataset lives on your machine.
kBaseDataDirectory = "C:\\Users\\Ayush\\OneDrive - BBTech\\Desktop\\sem 6\\BI\\datasets\\French_fashion_c2c\\6M-0K-99K.users.dataset.public.csv"  # local copy of the dataset CSV
#kBaseDataDirectory = "./kaggle/input"  # when working offline with jupyter notebook

dataset_files = []

if os.path.isfile(kBaseDataDirectory):
    # os.walk() silently yields nothing when it is given a file instead of a
    # directory, so a path pointing straight at the CSV is registered directly.
    print(kBaseDataDirectory)
    dataset_files.append(kBaseDataDirectory)
else:
    # This loop will import all dataset files in case we add more data in a next version of the dataset
    for dirname, _, filenames in os.walk(kBaseDataDirectory):
        for filename in filenames:
            print(os.path.join(dirname, filename))
            dataset_files.append(os.path.join(dirname, filename))

In [3]:
#######   Utility functions for statistics   #######

### Helpers to filter dataframes

def helper_has_fields_compared_to(df, columns, target, what, operator):
    """
    Helper to compare several columns to the same value.

    Each column listed in ``columns`` is compared to ``target`` with
    ``operator``; the per-column results are then combined into one result.

    :param df: object indexable by column name (``df[col]``) whose columns
                support the comparison operators and ``&`` / ``|``.
    :param columns: non-empty sequence of column names to test.
    :param target: value every column is compared against.
    :param str what: ``'all'`` to require the comparison to hold for every
                column (logical AND), ``'any'`` for at least one (logical OR).
    :param str operator: one of ``'>'``, ``'>='``, ``'<='``, ``'<'``, ``'=='``, ``'!='``.
    :return: the combined element-wise comparison result.
    :raises ValueError: if ``operator`` or ``what`` is not one of the supported values.
    :raises IndexError: if ``columns`` is empty.
    """
    comparators = {
        '>': lambda series: series > target,
        '>=': lambda series: series >= target,
        '<=': lambda series: series <= target,
        '<': lambda series: series < target,
        '==': lambda series: series == target,
        '!=': lambda series: series != target,
    }
    if operator not in comparators:
        raise ValueError(f"Unsupported operator {operator!r}; expected one of {sorted(comparators)}")
    if what not in ('all', 'any'):
        raise ValueError(f"Unsupported combination mode {what!r}; expected 'all' or 'any'")

    compare = comparators[operator]
    # The first column must use the requested operator too (it used to be
    # hard-wired to '>', which gave wrong masks for every other operator).
    res = compare(df[columns[0]])
    for col in columns[1:]:
        tmp = compare(df[col])
        res = (res & tmp) if what == 'all' else (res | tmp)
    return res

def helper_has_any_field_greater_than(df, columns, target):
    """Tell, element-wise, whether at least one of the given columns exceeds the target.

    Delegates to helper_has_fields_compared_to() with the 'any' combination
    mode and the '>' operator, and returns its result unchanged.
    """
    return helper_has_fields_compared_to(df, columns, target, what='any', operator='>')

def helper_has_all_field_greater_than(df, columns, target):
    """Tell, element-wise, whether every one of the given columns exceeds the target.

    Delegates to helper_has_fields_compared_to() with the 'all' combination
    mode and the '>' operator, and returns its result unchanged.
    """
    return helper_has_fields_compared_to(df, columns, target, what='all', operator='>')


### Other utilities for stats

def frequency(data, probabilities=False, sort=False, reverse=False):
    """Compute the frequency distribution of the elements of ``data``.

    Convenience front-end for effectif() covering its most common use case:
    the result is always requested in split form (values and counts as two
    separate sequences).

    :param data: A collection of elements you want to count.
    :param bool probabilities: Forwarded to effectif() as ``frequencies``; when
                true the counts are divided by their total so they sum up to 1. Default: False
    :param bool sort: Forwarded to effectif() as ``sort``. Default: False
    :param bool reverse: Forwarded to effectif() as ``reverse``. Default: False
    :return: a pair ``(values, counts)``.
    """
    options = {
        'returnSplitted': True,
        'frequencies': probabilities,
        'sort': sort,
        'reverse': reverse,
    }
    values, counts = effectif(data, **options)
    return values, counts


def frequences(data, returnSplitted=True, hashAsString=False, universe=None, frequenciesOverUniverse=None):
    """Compute relative frequencies of the elements of ``data``.

    When no ``universe`` is given, this is effectif() called with its
    ``frequencies`` flag set to True. Otherwise the work is handed to
    effectifU(), passing ``universe`` and ``frequenciesOverUniverse`` along.

    NOTE(review): effectifU() is not defined in this part of the notebook;
    its exact semantics should be checked where it is defined.

    :param data: the collection of elements to count.
    :param bool returnSplitted: forwarded unchanged to the underlying function.
    :param bool hashAsString: forwarded unchanged to the underlying function.
    :param universe: when not None, selects the effectifU() code path.
    :param frequenciesOverUniverse: only forwarded on the effectifU() code path.
    """
    if universe is not None:
        return effectifU(data, universe, returnSplitted, hashAsString, True, frequenciesOverUniverse)
    return effectif(data, returnSplitted, hashAsString, True)
    

def effectif(data, returnSplitted=True, hashAsString=False, frequencies=False, inputConverter=None, sort=False, reverse=False):
    """Count how many times each value occurs in ``data``.

    :param list data: an iterable of values to count
    :param bool returnSplitted: when true (default) return two lists
                ``(values, counts)`` in first-seen order; otherwise return the
                ``{value: count}`` dictionary itself
    :param bool hashAsString: whether we should convert the values in 'data' to
                string before comparing them
    :param bool frequencies: when true, every count is divided by the total
                number of counted elements, so the results sum up to 1
    :param function inputConverter: a callable function that is used to convert
                the values within data into the class you want the values to be
                compared as. When not provided, the values are used as they are.
                If used with parameter 'hashAsString', the hashed value will be
                the one returned by this function.
    :param bool sort: sort the result (only if returnSplitted). The ordering is
                delegated to `sortBasedOn`, which is defined outside this block.
    :param bool reverse: passed to `sortBasedOn` as ``reverse`` (only if sort
                and returnSplitted)
    :return: ``(values, counts)`` when returnSplitted, else a dict
    :raises TypeError: if a (converted) value is not hashable
    """
    effs = {}
    for val in data:
        if inputConverter is not None:
            val = inputConverter(val)
        key = str(val) if hashAsString else val
        # dict.get() replaces the former try / bare-except increment, which
        # would also have intercepted unrelated errors such as KeyboardInterrupt.
        effs[key] = effs.get(key, 0) + 1

    if frequencies:
        # tot can only be 0 when effs is empty, in which case nothing is divided.
        tot = sum(effs.values())
        effs = {key: count / tot for key, count in effs.items()}

    if returnSplitted:
        xis = list(effs.keys())
        nis = list(effs.values())
        if sort:
            xis, nis = sortBasedOn(nis, xis, nis, reverse=reverse)
        return xis, nis

    return effs

In [4]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are kept.
    Columns whose first value is numeric are drawn as a histogram, the others
    as a bar chart of their value counts.

    :param df: the DataFrame to plot.
    :param int nGraphShown: maximum number of columns to plot.
    :param int nGraphPerRow: number of subplots per row of the figure grid.
    """
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have more than 1 and fewer than 50 unique values
    nCol = df.shape[1]
    if nCol == 0:
        # Nothing to draw; this also covers an empty frame, for which
        # `columnDf.iloc[0]` below would fail.
        print('No distribution plots shown: no column has more than 1 and fewer than 50 unique values')
        return
    columnNames = list(df)
    # Integer (ceiling) division: plt.subplot() requires an integer row count,
    # and `/` produces a float on Python 3.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The plot type is decided from the Python/numpy type of the first value.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

# Correlation matrix
def plotCorrelationMatrix(df, graphWidth, segmentName=None):
    """Display the correlation matrix of ``df`` as a colored grid.

    Columns containing NaN and columns with a single unique value are dropped
    first. If fewer than 2 columns remain, a message is printed and nothing is
    plotted.

    :param df: the DataFrame to analyse.
    :param graphWidth: width and height of the (square) figure, in inches.
    :param segmentName: label used in the plot title; when falsy, the
                ``dataframeName`` attribute of ``df`` is used if it exists.
    """
    filename = segmentName if segmentName else getattr(df, "dataframeName", segmentName)
    # `axis` must be passed by keyword: pandas 2.0 removed the positional form.
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    # Restrict to numeric/boolean columns explicitly: since pandas 2.0,
    # DataFrame.corr() no longer drops non-numeric columns on its own.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created just above rather than assuming it is figure 1.
    corrMat = plt.matshow(corr, fignum = fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter-plot matrix of the numeric columns of ``df``.

    The diagonal shows a kernel density estimate of each column, and each
    upper-triangle cell is annotated with the correlation coefficient of its
    two columns. At most the first 10 eligible columns are plotted.

    :param df: the DataFrame to plot.
    :param plotSize: width and height of the (square) figure, in inches.
    :param textSize: font size of the correlation annotations.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove columns that would lead to df being singular
    # (`axis` must be passed by keyword: pandas 2.0 removed the positional form)
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly instead of `plt.np`, which is only an internal import
    # of matplotlib.pyplot and not part of its public API.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()