<a href="https://colab.research.google.com/github/DJCordhose/buch-machine-learning-notebooks/blob/master/kap3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fixing Issues in Data Files

We are going to use the following libraries:

- pandas is a library for creating and manipulating data tables
- matplotlib is a library for creating plots
- seaborn is a statistical visualization library built on top of matplotlib

In [None]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline
%pylab inline

In [None]:
try:
    from distutils.version import StrictVersion
except ImportError:
    # distutils was removed from the standard library in Python 3.12 (PEP 632).
    # Provide a small stand-in that supports the version comparisons made below.
    import re
    from functools import total_ordering

    @total_ordering
    class StrictVersion:
        """Minimal replacement that compares the leading major.minor.patch numbers."""

        def __init__(self, vstring):
            numbers = [int(n) for n in re.findall(r'\d+', vstring)[:3]]
            # Pad with zeros so that '1.2' and '1.2.0' compare as equal.
            self.version = tuple(numbers + [0] * (3 - len(numbers)))

        def __eq__(self, other):
            return self.version == other.version

        def __lt__(self, other):
            return self.version < other.version

In [None]:
import pandas as pd
print(pd.__version__)

# Stop right here if the installed pandas is older than 0.19.0
assert StrictVersion(pd.__version__) >= StrictVersion('0.19.0')

In [None]:
import seaborn as sns
print(sns.__version__)

# Stop right here if the installed seaborn is older than 0.7.0
assert StrictVersion(sns.__version__) >= StrictVersion('0.7.0')

### Let's load a data file with issues

In [None]:
# The file is read without a header row (header=None), so the column names are
# supplied explicitly; the text is decoded as ISO-8859-15.
df = pd.read_csv('dataset_with_issues.csv',
                 header=None,
                 encoding='iso-8859-15',
                 names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'])

In [None]:
# Preview the first rows of the freshly loaded table
df.head()

In [None]:
# Number of non-missing values per column; a column with a lower count than the others has gaps
df.count()

#### Before we can find issues in this data set, we need to learn how pandas works!

### Data structures in Pandas
- Series (think of it as a vector)
- DataFrame (think of it as a matrix)

### Series:
A Series is simply a list of data in which every value carries an index label.

In [None]:
# Without an explicit index the values are labelled 0, 1, 2
s1 = pd.Series([10,20,30])
s1

In [None]:
# Building a Series from a dict uses the dict keys as index labels:
s2 = pd.Series({'a' : 10, 'b' : 20, 'c' : 30})
s2

In [None]:
# ... or pass the labels explicitly through the `index` argument:
s3 = pd.Series([10,20,30], index=['a', 'b', 'c'])
s3

In [None]:
# Positional access goes through .iloc. Plain s3[0] on a Series with string
# labels relies on an integer-as-position fallback that pandas has deprecated.
s3.iloc[0]

In [None]:
# Access by index label
s3['a']

In [None]:
# A list of labels selects several entries and returns a Series
s3[['a', 'c']]

In [None]:
# Integer slices work by position: the first two entries
s3[0:2]

In [None]:
# Everything from position 1 onwards
s3[1:]

In [None]:
# A callable used as an indexer is called with the whole Series; the boolean
# mask it returns keeps only the entries equal to 20
s3[lambda series: series == 20]

### Dataframes

In [None]:
# A DataFrame consists of several Series: each dict key becomes a column name
# and the Series' index labels ('a', 'b', 'c') become the row labels
df1 = pd.DataFrame(
    {'one': pd.Series([10,20,30], index=['a', 'b', 'c']),
     'two': pd.Series([10,20,30], index=['a', 'b', 'c'])
    })

In [None]:
# Check the type of the object that was just built
type(df1)

In [None]:
# Display the table: columns 'one' and 'two', rows 'a' to 'c'
df1

In [None]:
# Take a look at the documentation of data frames if you want to know more
# pd.DataFrame?

In [None]:
# Selecting a single column returns a Series
df1['one']

In [None]:
# Column first, then row label -> a single value
# (df1.loc['a', 'one'] is the equivalent one-step lookup)
df1['one']['a']

## Let's use dataframes to find missing values and correct them

In [None]:
# Show the 'sepal width' column of the data frame
df['sepal width']

In [None]:
# Boolean Series: True wherever the 'sepal width' entry is missing
df['sepal width'].isnull()

In [None]:
# Use that boolean mask to select the rows whose 'sepal width' is missing
df[df['sepal width'].isnull()]

In [None]:
# Keep only the rows whose class is 'Iris-versicolor'
iris_versicolor = df[df['class']  == 'Iris-versicolor']
iris_versicolor

In [None]:
# The sepal widths of the Iris-versicolor rows only
iris_versicolor_sepal_width = iris_versicolor['sepal width']
iris_versicolor_sepal_width

In [None]:
# Average sepal width of the Iris-versicolor rows (missing values are skipped)
meanSepalWidth = iris_versicolor_sepal_width.mean()
meanSepalWidth

In [None]:
# Inspect the row labelled 82 before its 'sepal width' is overwritten
df.loc[82]

In [None]:
# Fill the 'sepal width' of row 82 with the Iris-versicolor mean.
# NOTE(review): assumes row 82 is the row with the missing value and belongs to
# Iris-versicolor - confirm against the output of the cells above.
df.loc[82,'sepal width'] = meanSepalWidth

In [None]:
# Look at row 82 again to see the value that was just assigned
df.loc[82]

In [None]:
# Recount the non-missing values per column after the fix
df.count()

## Find duplicated entries

In [None]:
# keep=False flags every member of a group of identical rows, so all copies are listed
df[df.duplicated(keep=False)]

In [None]:
# Non-missing values per column, separately for each class
df.groupby('class').count()

In [None]:
# Remove the row labelled 100 (presumably the surplus duplicate found above -
# confirm against that output). Dropping by label with errors='ignore' keeps the
# cell safe to re-run; the positional form df.index[[100]] would delete a
# different row on every additional run.
df = df.drop(100, errors='ignore')

In [None]:
# Per-class counts again, now with one row fewer
df.groupby('class').count()

## Fixing typing errors

In [None]:
# Rows whose class label is misspelled as 'Iris-setsoa'
df[df['class']  == 'Iris-setsoa']

In [None]:
# Set the class label of row 49 to the correct spelling.
# NOTE(review): assumes row 49 is the only misspelled row - confirm against the output above.
df.loc[49,'class'] = 'Iris-setosa'

In [None]:
# Per-class counts once more to check the class labels after the correction
df.groupby('class').count()

In [None]:
# The same summary printed as plain text
print(df.groupby('class').count())

## Convert measurement units for consistency

In [None]:
# Look at the first rows again; note the format of the 'petal width' values
df.head()

In [None]:
# Worked example on a single value: strip the ' mm' suffix, parse the number
# and divide by 10 (presumably millimetres -> centimetres)
pd.to_numeric('2 mm'.replace(' mm', '')) / 10

In [None]:
def convert_from_mm(row):
    """Return the row's 'petal width' as a number, converted from a millimetre string.

    The value is expected to look like '2 mm': the unit suffix is removed, the
    remainder is parsed as a number and divided by 10.

    Parameters
    ----------
    row : pandas.Series or mapping
        A table row; must provide the key 'petal width'.

    Returns
    -------
    float
        The converted width, or NaN if the value is missing.
    """
    raw = row['petal width']
    # A missing value (NaN/None) has no .replace(); pass it through as NaN
    # instead of failing with an AttributeError.
    if pd.isnull(raw):
        return float('nan')
    return pd.to_numeric(raw.replace(' mm', '').strip()) / 10

In [None]:
# axis='columns' hands each row to convert_from_mm; the results replace the column.
# Run this cell only once: afterwards the column holds numbers instead of strings.
df['petal width'] = df.apply(convert_from_mm, axis='columns')

In [None]:
# Check the first rows: 'petal width' should now be numeric
df.head()

## Find outliers

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# The same summary statistics, computed separately for each class
df.groupby('class').describe()

In [None]:
# df.hist?

In [None]:
# One histogram per numeric column; values far away from the rest stand out
df.hist(figsize=(15, 15))
plt.show()
# plt.savefig('ML_0304.png', bbox_inches='tight')

In [None]:
# Look up the rows whose sepal length is exactly 58
df[df['sepal length'] == 58]

In [None]:
# Pass the data as keywords: newer seaborn releases accept `x` and `y` only as
# keyword arguments, so the positional form raises a TypeError there.
sns.jointplot(x=df['sepal length'], y=df['petal length'])
plt.show()
# plt.savefig('ML_0305.png', bbox_inches='tight')

In [None]:
# Overwrite the sepal length of row 143 with 5.8.
# NOTE(review): presumably the 58 found above is a misplaced decimal point and
# row 143 is that row - confirm against the lookup output.
df.loc[143,'sepal length'] = 5.8

In [None]:
# p value:
# https://en.wikipedia.org/wiki/P-value
# http://www.dummies.com/education/math/statistics/what-a-p-value-tells-you-about-statistical-data/
# https://xkcd.com/1478/

# Pass the data as keywords: newer seaborn releases accept `x` and `y` only as
# keyword arguments, so the positional form raises a TypeError there.
sns.jointplot(x=df['sepal length'], y=df['petal length'])
plt.show()
# plt.savefig('ML_0306.png', bbox_inches='tight')

In [None]:
# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

# Correlate the numeric columns only: since pandas 2.0 DataFrame.corr() no longer
# drops non-numeric columns on its own and fails on the string column 'class'.
corrmat = df.select_dtypes(include='number').corr()
# annot=True writes each correlation coefficient into its cell
sns.heatmap(corrmat, annot=True)
plt.show()
# plt.savefig('ML_0307.png', bbox_inches='tight')

In [None]:
# Write the cleaned table to disk with a header row and without the index column
df.to_csv('iris_cleaned.csv', index=False, header=True)

## Further data visualization tools

In [None]:
# Box plot of the numeric columns
df.boxplot()

In [None]:
# Histograms of the columns, drawn separately for each class
df.groupby('class').hist()
plt.show()

In [None]:
# Histogram of 'sepal length' for each class
df.groupby('class')['sepal length'].hist()