# Task 2 Clustering
*Alberto Roberto Marinelli, Giacomo Cignoni, Alessandro Bucci*
## Importing Libraries
First we import the libraries necessary to cluster the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [9]:
df = pd.read_csv("../dataset/user_indicators.csv")
df.head(10)

Unnamed: 0,user_id,name,lang,bot,created_at,statuses_count,tweet_count,avg_tweet_len,total_num_of_likes,like_ratio_per_tweet,tweet_outside_of_possible_publishing_years,entropy_day,entropy_second,entropy_minute,entropy_15minutes,entropy_hour
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0,132,62.378788,5,0.037879,0,1.227162,5.634505,2.535773,1.792298,1.777031
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0,122,68.762295,6,0.04918,0,1.281955,5.424588,2.302317,1.69443,1.69443
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0,4,19.25,0,0.0,0,,,,,
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0,1439,86.76303,240,0.166782,0,0.347218,9.381856,7.495216,4.504821,2.935312
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0,3656,72.214442,205,0.056072,0,0.0,9.403373,5.999865,2.945209,1.532413
5,2199062688,Qq,en,0,2018-11-19 06:58:18,7406.0,2081,45.100913,1531,0.735704,0,0.265405,8.974019,5.677881,3.09743,1.982418
6,1174869769,Tania Wren,en,1,2018-02-15 10:51:28,21.0,24,64.041667,2272,94.666667,0,2.750914,4.316828,4.175736,3.931838,3.844882
7,3118659848,charlotte gray,en,0,2020-03-31 21:19:59,115.0,135,62.4,125,0.925926,0,0.719549,6.391175,5.602282,4.196843,3.108672
8,616225564,Anisha Williams,en,0,2017-06-25 15:49:36,,60,66.883333,3,0.05,0,3.087111,5.475863,5.153498,5.085702,4.863902
9,2357425536,Meda Tatlock,en,1,2019-02-25 04:30:56,69.0,150,64.286667,5,0.033333,0,1.180164,5.400191,2.239529,1.656572,1.629726


In [10]:
cat_feature = df[['user_id', 'name', 'lang', 'created_at']]
del df['user_id']
del df['name']
del df['lang']
del df['created_at']
del df['bot']
df.head()

Unnamed: 0,statuses_count,tweet_count,avg_tweet_len,total_num_of_likes,like_ratio_per_tweet,tweet_outside_of_possible_publishing_years,entropy_day,entropy_second,entropy_minute,entropy_15minutes,entropy_hour
0,76.0,132,62.378788,5,0.037879,0,1.227162,5.634505,2.535773,1.792298,1.777031
1,54.0,122,68.762295,6,0.04918,0,1.281955,5.424588,2.302317,1.69443,1.69443
2,3.0,4,19.25,0,0.0,0,,,,,
3,50.0,1439,86.76303,240,0.166782,0,0.347218,9.381856,7.495216,4.504821,2.935312
4,7085.0,3656,72.214442,205,0.056072,0,0.0,9.403373,5.999865,2.945209,1.532413


## Clustering Preprocessing - Normalization  
A good practice in clustering to avoid the bias given by the range of the different attribute is normalization.  
The most common adopted normalizations are: Z-Score and Min-Max

In [11]:
scaler = StandardScaler()
scaler.fit(df.values)
df.head().values

array([[7.60000000e+01, 1.32000000e+02, 6.23787879e+01, 5.00000000e+00,
        3.78787879e-02, 0.00000000e+00, 1.22716218e+00, 5.63450532e+00,
        2.53577317e+00, 1.79229793e+00, 1.77703075e+00],
       [5.40000000e+01, 1.22000000e+02, 6.87622951e+01, 6.00000000e+00,
        4.91803279e-02, 0.00000000e+00, 1.28195540e+00, 5.42458839e+00,
        2.30231747e+00, 1.69443018e+00, 1.69443018e+00],
       [3.00000000e+00, 4.00000000e+00, 1.92500000e+01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00,            nan,            nan,
                   nan,            nan,            nan],
       [5.00000000e+01, 1.43900000e+03, 8.67630299e+01, 2.40000000e+02,
        1.66782488e-01, 0.00000000e+00, 3.47217770e-01, 9.38185622e+00,
        7.49521642e+00, 4.50482128e+00, 2.93531248e+00],
       [7.08500000e+03, 3.65600000e+03, 7.22144420e+01, 2.05000000e+02,
        5.60722101e-02, 0.00000000e+00, 0.00000000e+00, 9.40337347e+00,
        5.99986507e+00, 2.94520904e+00, 1.53241349e+

In [None]:
scaler = MinMaxScaler()
X = scaler.fit_transform(df.values)
X.head()

## K-MEANS

In [12]:
def KMeansFunction (X, n_clusters_list, n_init_value, max_iter_value=300):
    kmeans_list = list()

    for numCluster in n_clusters_list:

        kmeans = KMeans(n_clusters=numCluster, n_init=n_init_value, max_iter=max_iter_value)
        kmeans.fit(X)
        kmeans_list.append(kmeans)
    
    return kmeans_list

In [13]:
clusters_list = [2,3,4,5,6,8,10]
init_value = 10
iter_value = 300

kmeans_list = KMeansFunction (df, clusters_list, init_value, iter_value)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').