# <h1 style="text-align: center"><b>184.702 Machine Learning</b>: Exercise 0, 2022S</h1>
<h5 style="text-align: center">Group 08: Wanecek Wilhelm, Simhandl Stefan, Beck Viktor</h5>

In [None]:
# Install dependencies
!pip3 install -q pandas_profiling

In [None]:
import os
# Base directory for every relative path used below. An empty string resolves
# to the directory the kernel is currently running in.
path = ''
currentdirectory = os.path.normpath(os.path.join(os.getcwd(), path))
os.chdir(currentdirectory)

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from pandas_profiling import ProfileReport

# Kinematics-motion-dataset

source:
https://www.kaggle.com/yasserh/kinematics-motion-data

In [None]:
# Import the raw kinematics data from the local datasets/ folder.
csvpath = os.path.join("datasets", "kinematics-motion-data.csv")
kinematics_df = pd.read_csv(csvpath, sep=',', encoding='latin-1')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
kinematics_df.info()
display(kinematics_df.head())

## Explore kinematics-motion-dataset

In [None]:
# Dimensions of the frame, followed by the number of missing entries per column
print(f"Shape of dataset (row, columns): {kinematics_df.shape}")
print(f"Missing values:\n {kinematics_df.isna().sum()}")

In [None]:
# Descriptive statistics of the kinematics data, as produced by pandas'
# DataFrame.describe() with its default settings. Being the last expression
# of the cell, the resulting table is rendered as the cell output.
kinematics_df.describe()

In [None]:
# 1) Date, time and username are not general features for predicting the
#    activities running and walking.
# 2) Exploring them adds nothing, so they are dropped before profiling.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and the drop becomes a no-op instead of a KeyError.
kinematics_df = kinematics_df.drop(columns=['username', 'date', 'time'], errors='ignore')

# Explore the dataset using pandas profiling.
# The report gets its own name (matching cc_prof used for the other dataset)
# so that kinematics_df remains a DataFrame after this cell has run.
kinematics_prof = ProfileReport(kinematics_df, minimal = False)

# Make sure the output folder exists before writing; to_file() is not relied
# upon to create missing parent directories.
os.makedirs('reports', exist_ok=True)
report_path = os.path.join('reports', 'explore_kinematics-motion-dataset.html')
kinematics_prof.to_file(output_file=report_path)

# Communities and Crime dataset

source:
https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

In [None]:
# Import the raw communities-and-crime data (semicolon-separated) from the
# local datasets/ folder.
csvpath = os.path.join("datasets", "communities_crime.csv")
cc_df = pd.read_csv(csvpath, sep=';', encoding='latin-1')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
cc_df.info()
display(cc_df.head())

In [None]:
# US state lookup table, obtained from
# https://github.com/scpike/us-state-county-zip/blob/master/geo-data.csv
csvpath = os.path.join("datasets", 'US_state-fips.csv')
state_fips = pd.read_csv(csvpath, sep=',', encoding='latin-1')

# Preview the table before and after removing the columns at positions 2 to 5
display(state_fips.head(3))
state_fips = state_fips.drop(columns=state_fips.columns[2:6])
display(state_fips.head(3))

## Explore communities and crime dataset

In [None]:
# Dimensions of the frame. Missing values are not listed here because the
# pandas-profiling report gives a better overview of them.
print(f"Shape of dataset (row, columns): {cc_df.shape}")

In [None]:
# Descriptive statistics of the communities-and-crime data, as produced by
# pandas' DataFrame.describe() with its default settings. Being the last
# expression of the cell, the resulting table is rendered as the cell output.
cc_df.describe()

In [None]:
# How many columns are non-predictive?
def listNonPredictiveCols(df: pd.DataFrame) -> pd.Index:
    """Return the column labels of ``df`` that contain the text 'predictive'.

    The match is a plain, case-sensitive substring test on the column label,
    so a label such as '... - not predictive ...' is selected, and so is any
    other label that merely mentions the word (e.g. 'not counted as
    predictive above').

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels are inspected; its data is not touched.

    Returns
    -------
    pd.Index
        The matching column labels, in their original order. Empty when no
        label matches.
    """
    # regex=False: 'predictive' is a literal, not a pattern.
    # na=False: labels that are not strings count as "no match" instead of
    # producing NaN entries that cannot be used as a boolean mask.
    isFlagged = df.columns.str.contains('predictive', regex=False, na=False)
    return df.columns[isFlagged]

# Collect the flagged columns, then show how many there are followed by their labels
cc_nonPredictiveCols = listNonPredictiveCols(cc_df)
display(len(cc_nonPredictiveCols))
display(cc_nonPredictiveCols)

In [None]:
## Drop non-predictive features
# 1) state, fold, county, community and communityname are not general features for predicting the crime rate
#    (state and community name are kept as index levels for additional information; they might be dropped for ML).
# 2) Exploring them adds nothing, so they are removed before profiling.
# rename() returns a new frame, so cc_df itself is left untouched.
df = cc_df.rename(columns={'state: US state (by number) - not counted as predictive above, but if considered, should be consided nominal (nominal)': 'state_fips',
                           'communityname: community name - not predictive - for information only (string)': 'communityname'})

df = df.drop(columns=['fold: fold number for non-random 10 fold cross validation, potentially useful for debugging, paired tests - not predictive (numeric)',
                      'county: numeric code for county - not predictive, and many missing values (numeric)',
                      'community: numeric code for community - not predictive and many missing values (numeric)'])

# Translate the numeric state code into the state name with a key-based lookup.
# Placing the two frames side by side with pd.concat(axis=1, join="inner")
# pairs rows by position: community i would receive whatever state sits in
# row i of the lookup file, and the result would be cut to the shorter frame.
# NOTE(review): assumes the lookup frame has the columns 'state_fips' and
# 'state' and that both key columns were parsed as the same numeric type - confirm.
state_lookup = state_fips[['state_fips', 'state']].drop_duplicates(subset='state_fips')
# how='left' keeps every community; a code missing from the lookup yields NaN.
df = df.merge(state_lookup, on='state_fips', how='left', validate='many_to_one').drop(columns='state_fips')
df = df.set_index(['state', 'communityname'])
display(df.head(3))
# A bare `df.shape` in the middle of a cell is not rendered, so display it explicitly.
display(df.shape)

# Explore dataset using pandas profiling. Since this df is so big, we only output a minimal report.
cc_prof = ProfileReport(df, minimal = True)
# Make sure the output folder exists before writing; to_file() is not relied
# upon to create missing parent directories.
os.makedirs('reports', exist_ok=True)
report_path = os.path.join('reports', 'explore_communities-crime-dataset.html')
cc_prof.to_file(output_file=report_path)

In [None]:
# Compute the number of attributes with ratio data versus interval data.
# An attribute is counted as ratio data when its label contains "Pct" or "pct".
cols = list(df.columns.values)
# contains() yields one True/False per column, so the sum is a number of
# columns; count() would add up every occurrence inside a label instead.
# The character class is [Pp]: a comma inside the brackets would be matched
# literally as a third alternative.
nbr_ratio = pd.Series(cols, dtype=object).str.contains('[Pp]ct', na=False).sum()
print('Number of attributes with ratio data :', nbr_ratio)
print('Number of attributes with interval data:', len(cols)-nbr_ratio)