In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the exported CSV into the raw dataframe
csv_path = r"../data/first_1000_JSON_files.csv"
df = pd.read_csv(csv_path, low_memory=True)

# Work on an independent (deep) copy so the raw dataframe stays available unchanged
df_copy = df.copy(deep=True)

In [None]:
# List the column labels of the raw dataframe
df.columns

In [None]:
# Count how many rows carry each distinct value of the 'event.key' column
# (value_counts() lists the most frequent value first and skips NaN by default)
df['event.key'].value_counts()

In [None]:
# Lift the column-width limit so long cell contents are printed in full
# instead of being cut off with an ellipsis
pd.options.display.max_colwidth = None

In [None]:
# Preview the first five rows of the working copy
df_copy.head(5)

In [None]:
# Summary statistics of the raw dataframe
# (for a frame with mixed dtypes, describe() reports the numeric columns by default)
df.describe()

In [None]:
# Render floats in fixed-point notation (3 decimals) instead of scientific notation.
# NOTE: this is a global pandas display option and stays active for all later output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Only the last expression of a cell is rendered, so intermediate results
# are printed explicitly instead of being left as bare expressions.
print(df.dtypes)

# Store the 'imei' column as int64 so the values are shown as whole numbers.
# NOTE(review): astype('int64') raises if the column contains NaN - confirm
# that 'imei' is never missing in the source data.
df['imei'] = df['imei'].astype('int64')

# Count the rows per imei once and reuse the result below
imei_counts = df['imei'].value_counts()

# An imei with a count above 1 occurs in more than one row
print(f"{(imei_counts > 1).sum()} of {len(imei_counts)} distinct imei values occur more than once")

# Show the 10 most frequent imei values
imei_counts.head(10)

In [None]:
# Print a concise summary of the working copy (columns, non-null counts, dtypes)
df_copy.info()

In [None]:
# Number of NaN values in every column, and the row count that marks 80% of the frame
nan_counts = df_copy.isna().sum()
row_limit = 0.8 * len(df_copy)

# Columns in which more than 80% of the rows are NaN
columns_to_drop_because_of_80_percent_NaN = df_copy.columns[nan_counts > row_limit]

# Remove those columns from the working copy
df_copy = df_copy.drop(columns=columns_to_drop_because_of_80_percent_NaN)

In [None]:
# Display the working copy in its current state
df_copy

In [None]:
# Columns that are not useful for the analysis
columns_to_drop = [
    '_id', 'imei', 'dts', 'dtd', 'event.dte',
    'tracker.loc.dtg', 'tracker.loc.hdop', 'tracker.loc.alt', 'tracker.loc.ang', 'tracker.loc.sp',
    'tracker.gsm.mcc', 'tracker.gsm.mnc', 'tracker.gsm.lac', 'tracker.gsm.cid',
    'tracker.metric.deculock', 'tracker.metric.dstatus', 'file_name',
]

# Drop all of them in a single call. errors='ignore' skips labels that are not
# (or no longer) present in the dataframe, so no per-column membership check is
# needed and the frame is copied once instead of once per dropped column.
df_copy = df_copy.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display the working copy in its current state
df_copy

In [None]:
# NaN count of every column, listed from the most to the least affected column
nan_per_column = df_copy.isna().sum()
nan_per_column.sort_values(ascending=False)

In [None]:
# Share of NaN values per column, highest first.
# The result is a fraction between 0 and 1 (NaN count divided by the number of rows).
nan_counts_desc = df_copy.isna().sum().sort_values(ascending=False)
nan_counts_desc / len(df_copy)

In [None]:
# A row is kept only if at least half of its columns hold a non-NaN value;
# rows in which more than 50% of the columns are NaN are removed
min_non_nan_per_row = 0.5 * len(df_copy.columns)
df_copy = df_copy.dropna(axis=0, thresh=min_non_nan_per_row)

In [None]:
# Replace 'event.key' by one indicator column per category.
# drop_first=True leaves out the indicator of the first category level.
df_copy = pd.get_dummies(
    data=df_copy,
    columns=['event.key'],
    drop_first=True,
)

In [None]:
# Display the working copy in its current state
df_copy

In [None]:
# Maximum number of NaN values a column may contain and still be kept
MAX_NAN_PER_COLUMN = 100000

# Number of NaN values per column
nan_per_column = df_copy.isna().sum()

# Remove the columns with more than MAX_NAN_PER_COLUMN NaN values.
# dropna(axis=1, thresh=100000) is NOT equivalent: `thresh` is the minimum number
# of non-NaN values a column needs in order to be kept, so it would drop every
# column of a dataframe that has fewer than 100000 rows.
columns_over_nan_limit = nan_per_column[nan_per_column > MAX_NAN_PER_COLUMN].index
df_copy = df_copy.drop(columns=columns_over_nan_limit)

# Independent copy of the cleaned dataframe for the analysis steps
df_variables = df_copy.copy()

# Show the NaN count of the columns that were kept
df_copy.isna().sum()

In [None]:
# Display the analysis dataframe
df_variables

In [None]:
# Print a concise summary of the analysis dataframe (columns, non-null counts, dtypes)
df_variables.info()

In [None]:
# Replace every NaN with the mean of its column.
# numeric_only=True restricts the means to the numeric columns: without it,
# pandas >= 2.0 raises a TypeError as soon as the dataframe holds a non-numeric
# column. Columns without a computed mean are left as they are by fillna().
column_means = df_variables.mean(numeric_only=True)
df_variables = df_variables.fillna(column_means)

In [None]:
# Draw one histogram per numerical column on a single 40 x 40 inch figure
histogram_figsize = (40, 40)
histogram_axes = df_variables.hist(figsize=histogram_figsize)
plt.show()

In [None]:
# Pairwise correlation between the numeric (and boolean indicator) columns.
# The columns are selected explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, where numeric_only defaults to False.
numeric_variables = df_variables.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_variables.corr()
correlation_matrix

In [None]:
# Draw a boxplot for every float64/int64 column and print the values that lie
# outside the 1.5 * IQR fences of that column
plottable_columns = [
    name for name in df_variables.columns
    if df_variables[name].dtype in ('float64', 'int64')
]

for column in plottable_columns:
    values = df_variables[column]

    plt.boxplot(values)
    plt.title(column)
    plt.show()

    # Quartiles and interquartile range of the column
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # A value is an outlier when it falls below the lower or above the upper fence
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = (values < lower_fence) | (values > upper_fence)

    print(f"Outliers for {column}:")
    print(values[is_outlier])
        

In [None]:
# Cluster the data with k-means and visualise the clusters
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Number of clusters requested from k-means
N_CLUSTERS = 50

# Leave out a 'cluster' column written by an earlier run of this cell, so that
# re-running the cell does not feed the old labels back in as a feature
cluster_features = df_variables.drop(columns='cluster', errors='ignore')

# Standardise the features before clustering
scaler = StandardScaler()
df_variables_scaled = scaler.fit_transform(cluster_features)

# Fit the k-means model (random_state fixed for reproducible results)
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
kmeans.fit(df_variables_scaled)

# Store the cluster label of every row in a "cluster" column
df_variables['cluster'] = kmeans.labels_

# Scatter plot of temperature against voltage, coloured by cluster label
plt.scatter(df_variables['device.metric.btemp'], df_variables['device.metric.bpackv'], c=df_variables['cluster'])
plt.xlabel('temperature')
plt.ylabel('voltage')
plt.title('KMeans clusters: temperature vs voltage')
plt.show()

In [None]:
# Display the analysis dataframe, which now includes the 'cluster' label column
df_variables

In [None]:
# Scatter plot of 'device.metric.btemp' against 'device.metric.bpackv' to spot
# outlying points by eye. The axis labels name the plotted metrics (temperature
# and voltage); the columns do not hold latitude/longitude values.
plt.scatter(df_variables['device.metric.btemp'], df_variables['device.metric.bpackv'])
plt.xlabel('temperature')
plt.ylabel('voltage')
plt.title('Outliers: temperature vs voltage')
plt.show()