# Pandas Introduction

In [None]:
import numpy as np
import pandas as pd

## Series object

In [None]:
# build a Series from a Python list; no index is given, so pandas labels the values 0..4
series_obj = pd.Series([10,20,30,40,50])
series_obj

In [None]:
# index access: [] looks the value up by index label; with the default
# 0..n-1 index, label 0 is also the first element
series_obj[0]

### Element-wise operations

In [None]:
# a Series of ages used by the element-wise and boolean examples that follow
series_ages = pd.Series([31,22,43,44,55])
series_ages

In [None]:
# Series + Series adds the values that share the same index label
series_ages + series_ages

In [None]:
# multiplying by a scalar is applied to every element
series_ages*2

In [None]:
# the scalar 100 is added to every element
series_ages + 100

### Boolean selection

In [None]:
# a comparison returns a boolean Series: True where the age is greater than 40
series_ages>40

In [None]:
# boolean access: indexing with a boolean Series keeps only the elements marked True
series_ages[series_ages>40]

## DataFrame object

In [None]:
# Build a DataFrame from a dictionary: every key becomes a column label and
# every list becomes that column's values (the lists must all be the same length).
data = {
    "Name": [
        "Tim Miller",
        "Ann Carter",
        "Ellen Lee",
        "Sam Carr",
        "Al Ball",
        "Carl Zee",
        "Sara Martin",
    ],
    "Gender": ["Male", "Female", "Female", "Male", "Male", "Male", "Female"],
    "Age": [32, 44, 21, 19, 45, 27, 39],
}
df = pd.DataFrame(data=data)
# Leaving `df` as the cell's last expression renders it as an HTML table;
# print(df) would fall back to plain text.
df

In [None]:
# show the first 5 rows (head() returns 5 rows unless told otherwise)
df.head() # == df.head(5)

In [None]:
# show the last 5 rows (tail() returns 5 rows unless told otherwise)
df.tail()  # == df.tail(5)

In [None]:
# selecting a single column by its label returns that column as a Series object
df['Name']     # dictionary notation

In [None]:
# attribute notation only works when the column label is a valid Python
# identifier that does not clash with an existing DataFrame attribute or method
df.Name     # attribute notation; Tab completion

In [None]:
# assignment by column (or add a column): a single scalar is copied into every row
df["Birth Year"] = 1999
df

In [None]:
# assignment by column (or add a column) from a list: one value per row, in row order
df["Married"] = ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']     # must match the length of the DataFrame
df

## Selection and Filtering
### Column selection

In [None]:
# New example frame: the integers 0..99 laid out as a 10x10 grid,
# with the columns labelled 'a' through 'j'.
data = pd.DataFrame(
    np.arange(100).reshape(10, 10),
    columns=list("abcdefghij"),
)
data

In [None]:
# a single label selects one column and returns it as a Series
data['a']

In [None]:
# a list of labels (note the double brackets) returns a DataFrame
data[["a", "e", "j"]]    # providing a list selects multiple columns

In [None]:
# the columns come back in the order the list names them
data[["j", "e", "a"]]

### Row selection

In [None]:
# [:1] returns only the first row, because the end of the slice is excluded
data[:1]     # use slice syntax to select rows

In [None]:
# rows at positions 5 through 8 (position 9 is excluded)
data[5:9]

In [None]:
# boolean Series: True for the rows where column "j" is greater than 40
data["j"] > 40

In [None]:
# boolean selection: keep only the rows where the mask is True
data[data["j"] > 40]

### Row and Column selection with loc
`loc` lets you select a subset of the rows and columns by label (the name of the row or column). Label slices include both the start and the end label.

In [None]:
# redisplay the frame for reference while reading the loc examples
data

In [None]:
# loc selects by the name/label of the row and column; label slices include
# the end label, so :5 returns the rows labelled 0 through 5 (six rows)
data.loc[:5, "b"]

In [None]:

# rows labelled 6 onward, columns 'a' through 'e' (both end labels are included)
data.loc[6:, 'a':'e']     # consecutive (loc selection is inclusive)

In [None]:
# ":" keeps every row; a list of labels picks columns that are not adjacent
data.loc[:, ['c', 'f', 'i']]     # not consecutive

### Row and Column selection with iloc
`iloc` lets you select a subset of the rows and columns by integer position. As with ordinary Python slices, the end position of a slice is excluded.

In [None]:
# iloc selects by integer position; the end of each slice is excluded,
# so this returns the rows at positions 0-4 and the columns at positions 2-4
data.iloc[:5, 2:5]

In [None]:
# a single integer returns the row at that position as a Series
data.iloc[4]  # gives you a row, assumes all of the columns

In [None]:
# lists of positions select arbitrary rows (5, 0, 3) and columns (9, 5, 0)
data.iloc[[5, 0, 3], [9, 5, 0]]  # returns selections in the order listed

# Data Exploration

### Descriptive and summary statistics

In [None]:
# load the data set; the relative path means the CSV is looked up in the
# current working directory
olympics_df = pd.read_csv("2016_Olympics.csv")

In [None]:
# preview the first five rows
olympics_df.head()

In [None]:
# list the column labels
olympics_df.columns

In [None]:
# Remove the 'id' and 'name' features.

# drop() hands back a new DataFrame rather than modifying the original,
# so the result is bound to the same name again.
olympics_df = olympics_df.drop(columns=['id', 'name'])

olympics_df.head()

In [None]:
# Add imperial-unit versions of height and weight.
# Conversion factors:
#   1 meter = 39.3700787 inches
#   1 kg    = 2.20462262 pounds

inches = 39.3700787
pounds = 2.20462262

# Series.mul multiplies every element by the scalar (same as the * operator)
olympics_df["height(in)"] = olympics_df["height"].mul(inches)
olympics_df["weight(lbs)"] = olympics_df["weight"].mul(pounds)

olympics_df.head()

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
olympics_df.describe()

In [None]:
# useful methods for describing the data
# (only the last expression of a cell is displayed, so swap in one of the
#  commented-out lines below to try the other methods)

olympics_df["height"].min()

# olympics_df["height"].max()
# olympics_df["height"].mean()
# olympics_df["height"].count()
# olympics_df["gold"].sum()

In [None]:
# column names, non-null counts, dtypes and memory usage
olympics_df.info()

In [None]:
# summary (count, unique, top, freq) of the object-dtype (string) columns
olympics_df.describe(include="object")

In [None]:
# returns only the unique items in a feature, in order of first appearance
olympics_df["sport"].unique()

In [None]:
# returns the number of unique items in a feature (missing values are not counted)
olympics_df["sport"].nunique()

In [None]:
# Lists the unique items in alphabetical order.
# A plain set() has no guaranteed order, so the values are sorted explicitly.
# dropna() removes any missing values first, because sorted() cannot compare
# NaN with strings.
sorted(olympics_df["sport"].dropna().unique())

### Correlated features

In [None]:
# Pairwise correlation between the numeric features.
# The frame still holds string columns at this point (e.g. sex, nationality,
# sport), and recent pandas versions raise an error when corr() is given
# non-numeric data, so the numeric columns are selected first.
olympics_df.select_dtypes(include="number").corr()

In [None]:
import seaborn as sns
import warnings

# a pairplot enables you to visualize pair-wise relationships between features.
# Warnings are silenced only while the plot is being built: a notebook-wide
# warnings.filterwarnings('ignore') would also hide every warning raised by
# the cells that follow.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    pair_plot = sns.pairplot(olympics_df[["sex", "height", "weight"]], hue='sex')

### Feature Engineering and Transformation

In [None]:
# Fold height and weight into a single BMI feature:
#   bmi = weight(kg) / height(m)**2

olympics_df["bmi"] = olympics_df["weight"].div(olympics_df["height"].pow(2))

olympics_df.head()

In [None]:
# The raw and converted height/weight columns are now summarised by bmi,
# so remove all four of them.
olympics_df = olympics_df.drop(columns=["height", "weight", "height(in)", "weight(lbs)"])

olympics_df.head()

In [None]:
# Total medals won per athlete: add the three medal columns across each row.
medal_columns = ["gold", "silver", "bronze"]
olympics_df["medal_ct"] = olympics_df[medal_columns].sum(axis="columns")
olympics_df.head()

In [None]:
# transform strings to integers
# NOTE: map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# replace the whole column with NaN.
olympics_df["sex"] = olympics_df["sex"].map({"female":0, "male":1})
olympics_df.head()

### Boolean Selection 

In [None]:
# boolean Series: True for the rows whose sport is "athletics"
olympics_df["sport"]=="athletics"

In [None]:
# isin() tests membership: True where the sport is any of the listed values
olympics_df["sport"].isin(["athletics","volleyball"])

In [None]:
# count() gives the quantity of (non-null) items in a column
in_selected_sports = olympics_df["sport"].isin(["athletics","volleyball", "gymnastics"])
olympics_df.loc[in_selected_sports, "sport"].count()

In [None]:
# value_counts() gives the quantity of each unique item in a column
in_selected_sports = olympics_df["sport"].isin(["athletics","volleyball", "gymnastics"])
olympics_df.loc[in_selected_sports, "sport"].value_counts()

In [None]:
# And (use an ampersand): a row is kept only when both masks are True
is_high_bmi = olympics_df["bmi"] > 29
won_gold = olympics_df["gold"] > 0
olympics_df.loc[is_high_bmi & won_gold, "sport"].count()

In [None]:
# And: no column is named here, so count() reports the non-null count of
# every column in the filtered frame
is_low_bmi = olympics_df["bmi"] < 19
is_usa = olympics_df["nationality"] == "USA"
olympics_df.loc[is_low_bmi & is_usa].count()

In [None]:
# Or (use a pipe): the two bmi conditions are OR-ed together first,
# then AND-ed with the medal condition
is_extreme_bmi = (olympics_df["bmi"] < 19) | (olympics_df["bmi"] > 29)
won_medal = olympics_df["medal_ct"] > 0
olympics_df.loc[is_extreme_bmi & won_medal, "sport"].value_counts()

## Archive cleaned and transformed DataFrame

In [None]:
# Save the cleaned and transformed Olympics data to file.
# This writes olympics_df, the frame that was cleaned above, rather than the
# small example frame `df`.
# index=False leaves the auto-generated row index out of the CSV, so it does
# not come back as an extra unnamed column when the file is read again.
olympics_df.to_csv("new_filename.csv", index=False)