In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training data from the given CSV file into a DataFrame
train_df = pd.read_csv('Data/trainData.csv')

# DATA CLEANING

In [None]:
# Percentage of cells in the frame that are null/NaN
rows = len(train_df.index)
columns = len(train_df.columns)
cellCount = rows * columns
# isna() flags every missing cell; the inner sum() counts them per column and
# the outer sum() adds the per-column counts together
numberOfNulls = train_df.isna().sum().sum()
percentageOfMissingValues = numberOfNulls / cellCount * 100
print(percentageOfMissingValues)

## Clearly, only a very small percentage of the cells in the dataset are missing (NaN).
## This means that we can drop the rows which contain missing values without
## affecting the dataset much.

In [None]:
# Discard every row in which at least one column is NaN/missing
train_df = train_df.dropna(axis=0, how='any')
train_df

In [None]:
# Per-column count of missing values; every count should be 0 after the drop
train_df.isna().sum()

## Next we need to make sure that the values in each column fall within the reasonable
## range, and have the data type, expected for that column.
## For example, the year, month and day columns must not contain anything other than
## positive integers.

In [None]:
# Overview of the current data type of every column
train_df.dtypes

## We see that 'year' has a data type of object while it should have been int64,
## and similarly for 'month', 'day', 'pressure' and so on...
## If a column has anomalous values, we replace them with either the mean or the median
## of that column.
## If a column has no anomalous values but its data type is wrong, we set it
## to the right data type.

In [None]:
# np.unsignedinteger is an abstract scalar class rather than a concrete dtype, and
# NumPy deprecates passing it where a dtype is expected; use an explicit 64-bit
# unsigned integer instead.
UNSIGNED_INT_DTYPE = np.uint64


def _to_unsigned_int(column):
    """Return `column` as unsigned integers with anomalous entries replaced.

    An entry is anomalous when it cannot be parsed as a number or is negative.
    Anomalous entries are replaced by the median of the remaining (valid) entries.

    Parameters
    ----------
    column : pandas.Series
        Raw column values (numbers and/or numeric strings).

    Returns
    -------
    pandas.Series
        Same index as `column`, with dtype UNSIGNED_INT_DTYPE.

    Raises
    ------
    ValueError
        Raised by the final integer cast when the column has no valid entry,
        because the replacement median is then NaN.
    """
    numeric = pd.to_numeric(column, errors='coerce')  # unparseable entries -> NaN
    # The sign check has to happen before the unsigned cast: once the values are
    # unsigned, a `< 0` test can never be true. NaN also compares False here.
    is_valid = numeric.ge(0)
    # Median of the valid entries only, so anomalies cannot skew their own replacement
    replacement = numeric[is_valid].median()
    # A fractional median (x.5) is truncated by the integer cast
    return numeric.where(is_valid, replacement).astype(UNSIGNED_INT_DTYPE)


# The serial number only needs its data type set
train_df['Unnamed: 0'] = train_df['Unnamed: 0'].astype(UNSIGNED_INT_DTYPE)

# Date/time components: the same clean-up is applied to each column
for column_name in ('year', 'month', 'day', 'hour'):
    train_df[column_name] = _to_unsigned_int(train_df[column_name])

In [None]:
# Print the per-column data types, then display the frame to check the conversions
print(train_df.dtypes)
train_df

In [None]:
# List the rows whose "pressure" entry cannot be parsed as a number.
# A direct astype(np.float64) raises as soon as it meets such an entry, which would
# stop "Run All" at this cell, so the offending rows are displayed instead and the
# column itself is left untouched here.
train_df[pd.to_numeric(train_df['pressure'], errors='coerce').isna()]

## We see here that under the "pressure" column, only one entry is of string data type,
## so we shall replace it with the median of the column.

In [None]:
# Parse "pressure" as numbers; any entry that cannot be parsed becomes NaN.
# This finds the bad entries wherever they are, instead of assuming row label 0.
pressure_numeric = pd.to_numeric(train_df['pressure'], errors='coerce')
# median() skips NaN, so the replacement is computed from the valid readings only
# (no temporary placeholder value takes part in it)
train_df['pressure'] = pressure_numeric.fillna(pressure_numeric.median()).astype(np.float64)

In [None]:
# Re-check the data type of every column after the "pressure" conversion
train_df.dtypes

## All the data types of the corresponding columns are correct now.

In [None]:
# Give the unnamed serial-number column the label "Train_SerialNo"
train_df = train_df.rename({"Unnamed: 0": "Train_SerialNo"}, axis="columns")

In [None]:
# Display the frame; the first column should now be labelled "Train_SerialNo"
train_df

# DATA DESCRIPTION

In [None]:
# Structural summary of the frame. info() prints its report itself and returns
# None, so it is called directly; wrapping it in print() emits a stray "None".
train_df.info()
# Summary statistics of the frame via describe()
train_df.describe()

In [None]:
# Mode (most frequent value or values) of every column
print("\nThe mode values are :-\n")
train_df.mode()

In [None]:
# Mean value of each numeric column.
# numeric_only=True restricts the reduction to numeric columns, so a text column
# cannot make the call fail on pandas versions where numeric_only defaults to False.
print("\nThe mean values are :-\n")
train_df.mean(numeric_only=True)

In [None]:
# Median value of each numeric column.
# numeric_only=True restricts the reduction to numeric columns, so a text column
# cannot make the call fail on pandas versions where numeric_only defaults to False.
print("\nThe median values are :-\n")
train_df.median(numeric_only=True)