# Pandas Introduction

In [None]:
import numpy as np
import pandas as pd

## Series object

In [None]:
# build a Series from a plain list; no index is given, so pandas labels the rows 0..4
series_obj = pd.Series(data=[10, 20, 30, 40, 50])
series_obj

In [None]:
# index access: no custom index was set, so [0] looks up the label 0 (the first element)
series_obj[0]

### Element-wise operations

In [None]:
# adding two Series combines the values that share the same index label
series_obj + series_obj

In [None]:
# multiplying by a scalar applies the operation to every element
series_obj*3

In [None]:
# adding a scalar adds it to every element
series_obj + 100

### Boolean selection

In [None]:
# re-display the Series for reference before filtering it
series_obj

In [None]:
# comparing a Series with a scalar gives a boolean Series: one True/False per element
series_obj>40

In [None]:
# boolean access: keep only the elements for which the comparison is True
series_obj.loc[series_obj > 40]

## DataFrame object

In [None]:
# create a DataFrame from a dictionary of equal-length lists:
# each key becomes a column name and each list supplies that column's values
# (named `people` rather than `data`, because `data` is bound to an unrelated DataFrame later on)
people = {
    "Name": ["Tim Miller", "Ann Carter", "Ellen Lee", "Sam Carr", "Al Ball", "Carl Zee", "Sara Martin"],
    "Gender": ["Male", "Female", "Female", "Male", "Male", "Male", "Female"],
    "Age": [32, 44, 21, 19, 45, 27, 39],
}
df = pd.DataFrame(people)
# print(df)  # when using print(), the DataFrame does not display as an HTML table
df

In [None]:
# show the first 5 rows (head() uses n=5 when no argument is given)
df.head() # == df.head(5)

In [None]:
# show the last 5 rows (tail() uses n=5 when no argument is given)
df.tail()  # == df.tail(5)

In [None]:
# selecting a single column by name returns it as a Series object
df['Name']     # dictionary (bracket) notation; works for any column name

In [None]:
# attribute notation supports Tab completion, but only works when the column name is a
# valid Python identifier (no spaces) and does not clash with an existing DataFrame attribute
df.Name  # attribute notation; Tab completion

In [None]:
# assignment by column (adds the column if it does not exist yet);
# a single scalar value is repeated for every row
df["Birth Year"] = 1999
df

In [None]:
# add a column from a list: one value per existing row, in row order
df["Married"] = ["Yes", "Yes", "No", "No", "Yes", "Yes", "No"]  # length must equal len(df)
df

## Selection and Filtering
### Column selection

In [None]:
# create a new 10x10 DataFrame holding 0..99 row by row, with columns labelled 'a' through 'j'
data = pd.DataFrame(
    np.arange(100).reshape(10, 10),
    columns=list("abcdefghij"),
)
data

In [None]:
# a single column name returns that column as a Series
data['a']

In [None]:
# note the double brackets: the inner pair is a Python list of column names
data[["a", "e", "j"]]    # providing a list selects multiple columns

In [None]:
# the columns come back in the order given in the list, not in their original order
data[["j", "e", "a"]]

### Row selection

In [None]:
# [:1] stops before position 1, so only the first row is returned
data[:1]     # use slice syntax to select rows

In [None]:
# rows at positions 5 through 8 (the end of the slice is excluded)
data[5:9]

In [None]:
# boolean mask: True for every row whose "j" value is greater than 40
mask = data["j"].gt(40)

In [None]:
# boolean selection: keep only the rows where the mask is True
data.loc[mask]

### Row and Column selection with loc
`loc` selects a subset of the rows and columns by label (the name of the row/column). Label slices include both endpoints.

In [None]:
# re-display the full frame for reference before selecting from it
data

In [None]:
# loc selects by the name/label of the row and column;
# label slices include the end label, so :5 returns the rows labelled 0 through 5
data.loc[:5, "b"]

In [None]:

# rows labelled 6 to the end, columns 'a' through 'e' with both endpoints included
data.loc[6:, 'a':'e']     # consecutive (loc selection is inclusive)

In [None]:
# ':' keeps every row; a list of labels picks columns that are not next to each other
data.loc[:, ['c', 'f', 'i']]     # not consecutive

### Row and Column selection with iloc
`iloc` selects a subset of the rows and columns by integer position. Position slices exclude the end position, like regular Python slices.

In [None]:
# iloc selects by integer position; the end of a slice is excluded,
# so this returns row positions 0-4 and column positions 2-4
data.iloc[:5, 2:5]

In [None]:
data.iloc[:5]  # first five rows (positions 0-4); with no column selector, all columns are kept

In [None]:
# lists of positions: rows 5, 0, 3 and columns 9, 5, 0
data.iloc[[5, 0, 3], [9, 5, 0]]  # returns selections in the order listed

# Data Exploration

### Descriptive and summary statistics

In [None]:
# load the dataset; the relative path means drug_use.csv must be in the current working directory
drug_df = pd.read_csv("drug_use.csv")

In [None]:
# preview the first 10 rows
drug_df.head(10)

In [None]:
# get the number of rows and columns in the dataset, as a (rows, columns) tuple
drug_df.shape

In [None]:
# per-column overview: column name, non-null count and dtype
drug_df.info()

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
drug_df.describe()

In [None]:
# summary of the object (string) columns: count, number of unique values, most frequent value and its frequency
drug_df.describe(include="object")

In [None]:
# returns the unique values, in the order in which they first appear in the column
drug_df["age"].unique()

In [None]:
# returns the unique values in sorted order
# (a plain set() has no guaranteed order; dropna() keeps sorted() from having to compare
#  missing values with the other entries)
sorted(drug_df["age"].dropna().unique())

### Analyze by grouping (pivoting) features

In [None]:
# mean nicotine_use per gender, highest first (as_index=False keeps 'gender' as a regular column)
(
    drug_df[["gender", "nicotine_use"]]
    .groupby(["gender"], as_index=False)
    .mean()
    .sort_values(by="nicotine_use", ascending=False)
)

In [None]:
# mean alcohol_use per age group, highest first ('age' becomes the index of the result)
(
    drug_df[["age", "alcohol_use"]]
    .groupby(["age"])
    .mean()
    .sort_values(by="alcohol_use", ascending=False)
)


In [None]:
# mean cannabis_use per Openness level, highest first ('Openness' becomes the index of the result)
(
    drug_df[["Openness", "cannabis_use"]]
    .groupby(["Openness"])
    .mean()
    .sort_values(by="cannabis_use", ascending=False)
)


In [None]:
# mean coke_use per education level, highest first ('education' becomes the index of the result)
(
    drug_df[["education", "coke_use"]]
    .groupby(["education"])
    .mean()
    .sort_values(by="coke_use", ascending=False)
)


### Boolean Selection 

In [None]:
# use count() to get the quantity of (non-null) items in a Series/column;
# here: the number of rows whose gender is "male"
drug_df.loc[drug_df["gender"] == "male", "gender"].count()

In [None]:
# a characteristic's share of a feature, as a fraction between 0 and 1
# (multiply by 100 for a percentage): "male" rows / all rows with a non-null gender
(
    drug_df.loc[drug_df["gender"] == "male", "gender"].count()
    / drug_df["gender"].count()
)

In [None]:
# And (&): rows that are both "low" Conscientiousness and cannabis_use == 1,
# divided by all "low" Conscientiousness rows
(
    drug_df.loc[
        (drug_df["Conscientiousness"] == "low") & (drug_df["cannabis_use"] == 1),
        "Conscientiousness"
    ].count()
    / drug_df.loc[drug_df["Conscientiousness"] == "low", "Conscientiousness"].count()
)

In [None]:

# rows that are both "high" Sensation Seeking and coke_use == 1,
# divided by all "high" Sensation Seeking rows
(
    drug_df.loc[
        (drug_df["Sensation Seeking"] == "high") & (drug_df["coke_use"] == 1),
        "Sensation Seeking"
    ].count()
    / drug_df.loc[drug_df["Sensation Seeking"] == "high", "Sensation Seeking"].count()
)

In [None]:
# Or (|): count the rows with nicotine_use == 1 whose Neuroticism OR Impulsiveness is "high".
# `&` binds tighter than `|`, so the Or needs its own parentheses; without them the condition
# parses as Neuroticism-high OR (Impulsiveness-high AND nicotine_use == 1), which also counts
# high-Neuroticism rows where nicotine_use is not 1.
drug_df.loc[
    ((drug_df["Neuroticism"] == "high") | (drug_df["Impulsiveness"] == "high"))
    & (drug_df["nicotine_use"] == 1),
    "nicotine_use"
].count()

In [None]:
# And combined with Or: the inner parentheses make the Or evaluate before the And
drug_df.loc[
    (drug_df["age"] == "65+")
    & ((drug_df["cannabis_use"] == 1) | (drug_df["coke_use"] == 1)),
    "coke_use"
].count()

In [None]:
# use value_counts() to break out the total count by category:
# number of cannabis_use == 1 rows for each education value
drug_df.loc[drug_df["cannabis_use"]==1, "education"].value_counts()

In [None]:
# isin() keeps rows whose Impulsiveness is one of the listed values;
# among those with coke_use == 1, count the rows per Impulsiveness value
drug_df.loc[
    drug_df["Impulsiveness"].isin(["high", "low"]) & (drug_df["coke_use"] == 1),
    "Impulsiveness"
].value_counts()

In [None]:
# number of cannabis_use == 1 rows for each age value
drug_df.loc[drug_df["cannabis_use"]==1, "age"].value_counts()

In [None]:
# number of chocolate_use == 1 rows for each gender value
drug_df.loc[drug_df["chocolate_use"] == 1, "gender"].value_counts()

In [None]:
# number of chocolate_use == 0 rows for each gender value
drug_df.loc[drug_df["chocolate_use"] == 0, "gender"].value_counts()

## One-Hot Encoding Using Pandas

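Works with categorical features stored as strings or as numbers. Numeric columns are only encoded when they are named explicitly in the `columns` argument.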

In [None]:
# use pandas to do one-hot encoding: each listed column is replaced by one
# indicator column per category, named <column><prefix_sep><category>

# DEFAULTS:
    # prefix_sep='_'     ... separator between the column name and the category
    # columns=None       ... will encode all columns with object, string or category dtype
    # drop_first=False   ... keep an indicator column for every category
# returns a new DataFrame; drug_df itself is not modified

one_hot_drug = pd.get_dummies(drug_df, columns=["Extroversion"])
one_hot_drug.head()

In [None]:
# You can change the name/prefix of the one-hot encoded feature.
# The default separator '_' is still inserted after the prefix, so the resulting
# column names are 'Ex -' + '_' + <category>.
pd.get_dummies(drug_df["Extroversion"], prefix='Ex -')

## Archive cleaned and transformed DataFrame

In [None]:
# save the cleaned and transformed data (the one-hot encoded frame) to file;
# `df` is only the small example frame from the introduction, not the transformed dataset.
# index=False leaves the auto-generated row numbers out of the CSV, so reading the file
# back does not produce an extra unnamed column.
one_hot_drug.to_csv("new_filename.csv", index=False)