<a href="https://colab.research.google.com/github/Granero0011/DS-Unit-1-Sprint-1-Dealing-With-Data/blob/master/Data_Cleaning_and_Exploring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#basic importing and aliasing
import numpy as np
import pandas as pd

In [0]:
#turn off dataframe truncation: a limit of None tells pandas to show every column and every row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Reading in Data

In [0]:
#CSV: read_csv accepts a URL directly
black_friday_url = 'https://raw.githubusercontent.com/pierretd/datasets-1/master/BlackFriday.csv'
black_friday = pd.read_csv(black_friday_url)
print(black_friday.head())

#JSON
#json load tutorial: https://chrisalbon.com/python/data_wrangling/load_json_file_into_pandas/
space_url = 'https://raw.githubusercontent.com/pierretd/datasets-1/master/PeopleInSpaceNow.json'
space = pd.read_json(space_url)
print(space.head())

#TSV: same reader as CSV, with a tab as the field delimiter
fast_fm_url = 'https://raw.githubusercontent.com/pierretd/datasets-1/master/FastFmProfiles.tsv'
fast_fm = pd.read_csv(fast_fm_url, delimiter='\t', encoding='utf-8')
print(fast_fm.head())

In [0]:
#uploading a file from the local machine
#NOTE: google.colab is only importable inside Google Colab, so this cell fails in plain Jupyter
from google.colab import files
upload = files.upload() #result kept in `upload`; presumably a dict of filename -> file contents (confirm in the Colab docs)

In [0]:
#naming the columns while reading: names= supplies the headers
#(the raw file is assumed to have no header row of its own)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
auto_columns = [
    'symboling', 'norm_loss', 'make', 'fuel', 'aspiration', 'doors',
    'bod_style', 'drv_wheels', 'eng_loc', 'wheel_base', 'length', 'width',
    'height', 'curb_weight', 'engine', 'cylinders', 'engine_size',
    'fuel_system', 'bore', 'stroke', 'compression', 'hp', 'peak_rpm',
    'city_mpg', 'hgwy_mpg', 'price',
]
auto = pd.read_csv(url, names=auto_columns)

# Data Cleaning

In [0]:
#rename columns after reading in
#UCI "flags" data set; with header=None pandas labels the columns 0, 1, 2, ...
flag_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
flag_data = pd.read_csv(flag_data_url, header=None)

#descriptive names in column order; enumerate() pairs each one with its positional label
feature_names = ['name', 'landmass', 'zone', 'area', 'population', 'language',
                 'religion', 'bars', 'stripes', 'colours', 'red', 'green',
                 'blue', 'gold', 'white', 'black', 'orange', 'mainhue',
                 'circles', 'crosses', 'saltires', 'quarters', 'sunstars',
                 'crescent', 'triangle', 'icon', 'animate', 'text',
                 'topleft', 'botright']
feature_map = dict(enumerate(feature_names))

#rename returns a new dataframe; reassigning avoids mutating in place
flag_data = flag_data.rename(columns=feature_map)

In [0]:
#replacing values in a data set with a new value
#here '?' is taken to mean "missing", so it is swapped for a real NaN that pandas treats as null
#(np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
auto = auto.replace('?', np.nan)

In [0]:
#store 'make' with the pandas 'category' dtype for faster operations
make_as_category = auto['make'].astype('category')
auto['make'] = make_as_category

In [0]:
#label encoding: overwrite 'make' with the integer code of each category
#(the .cat accessor needs 'make' to already be of category dtype)
make_codes = auto['make'].cat.codes
auto['make'] = make_codes

In [0]:
#one hot encoding
#the result gets its own name: get_dummies drops the encoded 'make' column, and the
#later cells (binary encoding, value_counts, nunique, pivot_table) still read auto['make']
auto_onehot = pd.get_dummies(auto, columns=['make'], prefix=['make'])
auto_onehot.head()

In [0]:
!pip install category_encoders as ce
#binary encoding
encoder = ce.BinaryEncoder(cols=['make'])
auto = encoder.fit_transform(auto)

In [0]:
#filling null values (will not run, just example: df stands for any dataframe)
df.fillna(0) #fill nulls with a value
df.ffill() #forward fill; df.bfill() fills backward (these replace the deprecated fillna(method=...))
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} #one fill value per feature
df.fillna(value=values) #fill each feature with a different value
df.fillna(value=values, limit=1) #fill only the first null in each feature

In [0]:
#dropping null values (df stands for any dataframe)
df.dropna() #remove every row that contains a null
df.dropna(axis='columns') #remove every column that contains a null
df.dropna(thresh=3, axis='rows') #keep only the rows that have at least 3 non-null values

In [0]:
#add a column (will not run, just example)
#df and df2 stand for any two dataframes; the second assignment overwrites the column made by the first
df['new feature'] = 'Value' #new column with same value for all rows
df['new feature'] = df2['df2 feature'] #new column from another dataframe column (NOTE: pandas matches rows by index label, not by position)

In [0]:
#combine two data frames by stacking their rows (will not run, just example)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df3 = pd.concat([df1, df2])

In [0]:
#use np.where to encode a category based on values in another category (will not run, just example)
#na=False makes missing values count as "no match"; without it str.contains can return NaN
#for them, and np.where treats NaN as truthy, so nulls would be encoded as 1
df['feature1'] = np.where(df['feature2'].str.contains('value', na=False), 1, 0)

In [0]:
#loop to impute values based on whether they are numeric or categorical (will not run, just example)
from pandas.api.types import is_numeric_dtype

#iterating over a dataframe yields its column headers
for header in comic_chars_df:
  if is_numeric_dtype(comic_chars_df[header]):
    #do something with the numeric features here
    pass #placeholder: a branch that holds only a comment is a syntax error
  else:
    #do something with the categorical features here
    pass #placeholder, see above

# Exploring Data

In [0]:
#preview the first rows of the dataframe; head() shows 5 unless another count is passed
auto.head(n=5)

In [0]:
#summary of the dataframe: column names, non-null counts, dtypes and memory usage
auto.info()

In [0]:
#get basic stats (count, mean, std, min, quartiles, max) on any numeric features
auto.describe()

In [0]:
#dimensions of the dataframe as a (rows, columns) tuple
auto.shape

In [0]:
#what are the data types of the different features (one dtype per column)
auto.dtypes

In [0]:
#how many non-null items in each feature of a dataframe
auto.count()

In [0]:
#count how many of each item in a feature, most frequent first (nulls are left out by default)
auto['make'].value_counts()

In [0]:
#count how many unique values in a category
#two equivalent ways, returned as one tuple so that both results are shown
#(a notebook cell only displays its last expression, so the first one used to be dropped)
auto['make'].value_counts().count(), auto['make'].nunique()

In [0]:
#how many null or non-null values are there and what features are they in
#both counts side by side in one table (a notebook cell only displays its last
#expression, so the separate isnull() result used to be dropped)
pd.DataFrame({'null': auto.isnull().sum(), 'not_null': auto.notnull().sum()})

In [0]:
#additional useful pandas calls (will not run, just example)
#df, func, mapping, low, high, values, dtype and label are placeholders
df.tail() #last rows of the dataframe
df.apply(func) #run func on every column (or on every row with axis=1)
df['feature'].map(mapping) #Series method: translate each value through a dict or function
df['feature'].between(low, high) #Series method: boolean mask for low <= value <= high
df['feature'].unique() #Series method: array of the distinct values
df.index #row labels
df.values #underlying data as a numpy array
df.isin(values) #boolean mask: is each element one of values
df.astype(dtype) #cast to another data type
df.iloc[0] #select by integer position
df.loc[label] #select by label
df.groupby('feature') #group rows by the values of a feature
df.T #transpose rows and columns
df.interpolate() #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html

In [0]:
#binning with pd.cut (will not run, just example)
#an integer bins= asks for that many equal-width bins, here 5 over the 'time' feature
time_bins = pd.cut(df['time'], bins=5)

In [0]:
#crosstab for quick small tables (will not run, just example)
#rows: 'purchased' values, columns: the time bins; normalize='columns' turns the counts
#into proportions within each bin
pd.crosstab(index=df['purchased'], columns=time_bins, normalize='columns')

In [0]:
#pivot table
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
#both tables share one layout: 'length' values, one row per make, one column per bod_style
pivot_layout = dict(values=['length'], index=['make'], columns=['bod_style'])

#default aggregation is the mean; kept under its own name so the next table does not overwrite it
table_mean = pd.pivot_table(auto, **pivot_layout)

#same layout, summing the length values instead
#(the string 'sum' replaces np.sum, which pandas >= 2.1 flags with a FutureWarning)
table = pd.pivot_table(auto, aggfunc='sum', **pivot_layout)