In [1]:
from functools import reduce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabaz_score, silhouette_score
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
plt.rcParams['figure.figsize'] = (12, 8)

In [3]:
def otherise(df_col, n, lst_top_n=None):
        """Otherise dataframe columns if it contains too many values

        Arguments:
            df_col {series} -- dataframe column
            n {int} -- number of maximum values

        Returns:
            series -- dataframe column with reduced values
        """
        if lst_top_n:
            top_n = lst_top_n
        else:
            z = pd.DataFrame(df_col).copy()
            z['val'] = 1
            z.columns = ['key', 'val']
            top_n = z.groupby('key').size().sort_values(ascending=False)[:n]
            top_n = pd.DataFrame(top_n).reset_index()
            top_n = list(top_n.key.values)
        df_col = df_col.apply(lambda y: y if y in top_n else np.NaN)
        
        return df_col

In [4]:
def get_dummies(x):
    """Turn a column of string values into categorical variables through one-hot encoding.

    :param x: Input data frame with original columns
    :type x: Pandas DataFrame
    :returns x: Output data frame with dummy variables
    :rtype x: Pandas DataFrame
    """
    column_names = []
    for i in range(0, len(x.columns)):
        if x.dtypes[i] == 'object':
            column_names.append(x.columns[i])

    dummies = pd.get_dummies(x[column_names])
    x = pd.concat([x.reset_index(drop=True), dummies], axis=1)
    x = x.drop(column_names, axis=1)    

    return x

In [5]:
dvdrental_demographics = pd.read_csv('dvdrental_demographics.csv', index_col=False,  sep = ";")

In [6]:
dvdrental_time = pd.read_csv('dvdrental_time.csv', index_col=False, sep = ";")

In [7]:
dvdrental_pay = pd.read_csv('dvdrental_pay.csv', index_col=False, sep = ";")

In [8]:
dvdrental_preference = pd.read_csv('dvdrental_preference.csv', index_col=False, sep = ";")

In [9]:
dvdrental_demographics['city'] = otherise(dvdrental_demographics['city'], 2)
dvdrental_demographics['country'] = otherise(dvdrental_demographics['country'], 2)

In [10]:
dvdrental_demographics.head(1)

Unnamed: 0,customer_id,store_id,first_name,address_id,create_date,active,city_id,city,country_id,country
0,524,1,Jared,530,2006-02-14,1.0,419,,45,


In [11]:
aggregations_dm = {
    'store_id': ["max"],
    'city': [lambda x: x.value_counts().index[0] if len(x.value_counts()) else None],
    'country': [lambda x: x.value_counts().index[0] if len(x.value_counts()) else None],
    'active': ["max"]  
    } 

df_dm = dvdrental_demographics.groupby('customer_id').agg(aggregations_dm).reset_index()
df_dm.columns = ['customer_id','store_id','city','country','active']

In [12]:
df_dm.head()

Unnamed: 0,customer_id,store_id,city,country,active
0,1,1,,,1.0
1,2,1,,,1.0
2,3,1,,,1.0
3,4,2,,,1.0
4,5,1,,,1.0


In [13]:
df_dm = get_dummies(df_dm)
df_dm.head()

Unnamed: 0,customer_id,store_id,active,city_Aurora,city_London,country_China,country_India
0,1,1,1.0,0,0,0,0
1,2,1,1.0,0,0,0,0
2,3,1,1.0,0,0,0,0
3,4,2,1.0,0,0,0,0
4,5,1,1.0,0,0,0,0


In [14]:

dvdrental_time['return_date'] = np.where(dvdrental_time['return_date'] == '\\N','2013-05-26 14:50:58' ,dvdrental_time['return_date']) 

dvdrental_time['rental_date'] = pd.to_datetime(dvdrental_time['rental_date'])
dvdrental_time['return_date'] = pd.to_datetime(dvdrental_time['return_date'])

dvdrental_time['rent_time'] = (dvdrental_time['return_date'] - dvdrental_time['rental_date']).astype('timedelta64[D]')

In [15]:
aggregations_time = {
    'rental_date': { # work on the "rental_date" column
        'max_rental_date': 'max',  # get the max, and call this result 'max_rental_date'
        'min_rental_date': 'min', # get min, call result 'min_rental_date'
        'total_dvds': 'count'
    },
    'rent_time': {     # Now work on the "rent_time" column
        'ave_rent_time': 'median'   # Find the median, call the result "ave_rent_time"
        }
}

# Perform groupby aggregation by "customer_id"
df_time = dvdrental_time.groupby('customer_id').agg(aggregations_time).reset_index()
df_time.columns = ['customer_id','max_rental_date','min_rental_date','total_dvds','ave_rent_time']

SpecificationError: nested renamer is not supported