# Setup

In [None]:
# Dependencies

import pandas as pd
import warnings

In [None]:
# Let pandas show up to 500 rows before truncating notebook output

pd.options.display.max_rows = 500

In [None]:
# Silence pink warnings

warnings.filterwarnings('ignore')

# Functions

In [None]:
def basic_clean(a_df):
    '''Initial cleaning pass: drops fully duplicated rows, keeping the first occurrence.

    Returns the de-duplicated dataframe; the dataframe passed in is not modified.
    '''

    return a_df.drop_duplicates(keep="first")

In [None]:
def all_caps(a_df, a_list):
    '''Upper-cases the string columns of a_df that are named in a_list.

    The columns are overwritten on a_df itself, and that same dataframe is returned.
    '''

    for column_name in a_list:
        upper_values = a_df[column_name].str.upper()
        a_df[column_name] = upper_values

    return a_df

In [None]:
def vc(a_ser):
    '''Counts occurrences of each distinct value in a series, returned as a dataframe'''

    counts = a_ser.value_counts()
    return counts.to_frame()

In [None]:
def clean_isbns(a_ser):
    '''Renders isbn series data as str and removes the float-style ".0" suffix, returns a series

    Values that were parsed as floats stringify with a trailing ".0"
    (e.g. 9780133594140.0 -> "9780133594140.0"). Only that exact suffix is removed.
    str.rstrip(".0") must not be used for this: it treats its argument as a set of
    characters, so it would also strip genuine trailing zeros from the ISBN itself
    and reduce "0.0" to an empty string.
    '''

    a_ser = a_ser.astype("str")
    # Anchored pattern: remove one literal ".0" at the very end of the string, nothing else
    a_ser = a_ser.str.replace(r"\.0$", "", regex=True)

    return a_ser

In [None]:
def clean_prices(a_ser):
    '''Converts raw price text to floats, returns a series

    Leading "$" characters and "," separators are removed, and the
    "PRICE NOT YET AVAILABLE**" / "PREPAID" markers are replaced with 0.01.
    '''

    prices = a_ser.astype("str").str.lstrip("$").str.replace(",", "", regex=False)

    # Swap the non-numeric markers for a 0.01 stand-in so they can be parsed by float()
    for marker in ("PRICE NOT YET AVAILABLE**", "PREPAID"):
        prices = prices.str.replace(marker, "0.01", regex=False)

    return prices.apply(float)

# Import courses.csv into dataframe, clean and transform data

### *courses_df*

In [None]:
# This csv is large, so we pass low_memory=False to read_csv when rendering as dataframe

# NOTE: enter correct filepath for your downloaded csv file

filepath1 = "C:\\BNCollegeCourses_2022-02-05.csv"

# Read the csv, then remove duplicate rows via basic_clean
df1 = basic_clean(pd.read_csv(filepath1, encoding="Utf-8", low_memory=False))

In [None]:
# Checking shape and dtypes of the freshly loaded courses data

df1.info()

In [None]:
# Remove duplicate rows, then discard the columns we do not need

df1 = df1.drop_duplicates().drop(
    columns=["store_id", "catalog_id", "campus", "campus_id", "term_id", "scanDate"]
)

In [None]:
# Put the id columns first, followed by the text columns, then upper-case the text columns

id_columns = ["department_id", "course_id", "section_id"]
text_columns = ["university", "term", "department", "course", "section"]

df1 = df1[id_columns + text_columns]

all_caps(df1, text_columns)

In [None]:
# Examine value counts on the term column to see which raw labels need consolidating into categories

vc(df1["term"])

In [None]:
# Use boolean masks to consolidate into manageable term categories: FALL, WINTER, SPRING, SUMMER

seasons = ["FALL", "SPRING", "SUMMER", "WINTER"]

# Seasons are applied in this order. Once a row has been relabelled with a season name
# it no longer contains any other season name, so the first matching season wins.
for season in seasons:
    # na=False treats a missing term as "no match"; without it the mask would contain
    # NaN and boolean indexing with .loc would raise
    season_mask = df1["term"].str.contains(season, na=False)
    df1.loc[season_mask, "term"] = season

# Rows whose term matched none of the seasons (including missing terms) are junk: drop them
df1 = df1.loc[df1["term"].isin(seasons), :]

vc(df1["term"])

In [None]:
# Convert the repetitive text columns to the categorical data type - saves memory

for col_name in ["term", "university", "department", "course", "section"]:
    df1[col_name] = df1[col_name].astype('category')

df1.info()

In [None]:
# Renumber the index after the row filtering above; drop=True discards the old index
df1 = df1.reset_index(drop=True)

In [None]:
# Keep a copy of the cleaned frame under a descriptive name
courses_df = df1.copy()

In [None]:
# Display the cleaned courses dataframe
courses_df

In [None]:
# Render as csv file for import into PostgreSQL
# (to_csv also writes the dataframe index as the first, unnamed column by default)

courses_df.to_csv("courses.csv", encoding="utf-8")

# 39 MB csv

# Import textbooks.csv into dataframe, clean and transform data

### *textbooks_df*

In [None]:
# This csv is large, so we pass low_memory=False to read_csv when rendering as dataframe

# NOTE: enter correct filepath for your downloaded csv file

filepath2 = "C:\\BNTextbook_2022-02-05.csv"

# Read the csv, then remove duplicate rows via basic_clean
df2 = basic_clean(pd.read_csv(filepath2, encoding="Utf-8", low_memory=False))

In [None]:
# Checking shape and dtypes of the freshly loaded textbooks data

df2.info()

In [None]:
# Remove duplicate rows, then discard the columns we do not need

df2 = df2.drop_duplicates().drop(
    columns=[
        "store_id", "catalog_id", "campus_id", "term_id", "book_id",
        "no_textbook_message", "recommend_type", "scanDate",
    ]
)

In [None]:
# Capitalize text data where appropriate (no column reordering happens in this cell)
# all_caps overwrites the listed columns on df2 itself, so no assignment is needed

all_caps(df2, ["title", "edition", "publisher", "book_type", "price"])

In [None]:
# Drop any rows where title, ISBN, and price are ALL missing (i.e., not much use for analysis)
# how="all" keeps a row as long as at least one of the three columns has a value

df2 = df2.dropna(axis=0, subset=["title", "ISBN", "price"], how="all")

In [None]:
# Fill in missing values with default values
# (note: the ISBN default is the float 0.0, while the price default is the string "0.01")

df2 = df2.fillna(value={"edition":"unknown", "publisher":"unknown", "ISBN": 0.0, "price":"0.01"})

In [None]:
# Rename the ISBN column to lower-case isbn

df2 = df2.rename(columns={"ISBN":"isbn"})

In [None]:
# Transform isbn data: render as str and strip unwanted characters (see clean_isbns)

df2["isbn"] = clean_isbns(df2["isbn"])

In [None]:
# Transform price data: strip unwanted characters and render as float (see clean_prices)

df2["price"] = clean_prices(df2["price"])

In [None]:
# Take a random sample of 50 rows to see if transformations are effective
# (no random_state is passed, so the sampled rows differ from run to run)

df2.sample(50)

In [None]:
# Print the dtype of every column, followed by a value_counts table for each column

print(f"DATA TYPES:\n\n{df2.dtypes}\n\n")

separator = "=================================\n"

for col in df2.columns:
    print(f"COLUMN: {col}\n")
    print(separator)
    print(vc(df2[col]))
    print("\n\n")

In [None]:
# Convert the repetitive text columns to the categorical data type - saves memory

for col_name in ["book_type", "edition", "publisher", "isbn", "title"]:
    df2[col_name] = df2[col_name].astype('category')

df2.info()

In [None]:
# Renumber the index after the row drops above; drop=True discards the old index
df2 = df2.reset_index(drop=True)

In [None]:
# Keep a copy of the cleaned frame under a descriptive name
textbooks_df = df2.copy()

In [None]:
# Preview the first five rows of the cleaned textbooks dataframe
textbooks_df.head(5)

In [None]:
# Spot-check 50 randomly chosen rows (unseeded, so the rows differ from run to run)
textbooks_df.sample(50)

In [None]:
# Render as csv file for import into PostgreSQL
# (to_csv also writes the dataframe index as the first, unnamed column by default)

textbooks_df.to_csv("textbooks.csv", encoding="utf-8")

# 172 MB csv

# Merge courses and textbooks dataframes with inner join

### *merge_courses_and_textbooks_df*

In [None]:
# Inner merge of the courses and textbooks dataframes on their common id fields
# (how="inner" keeps only rows whose id combination appears in both dataframes)

merge_df = pd.merge(courses_df, textbooks_df, on=["department_id", "course_id", "section_id"], how="inner")

In [None]:
# Check shape and dtypes of the merged dataframe
merge_df.info()

In [None]:
# Remove duplicate rows. drop_duplicates returns a new dataframe rather than modifying
# merge_df, so the result has to be assigned back for the de-duplication to take effect.
merge_df = merge_df.drop_duplicates()

In [None]:
# Keep only the columns listed here, in this order; any other column is dropped
output_columns = [
    "university", "term", "department", "course", "section",
    "title", "edition", "price", "isbn", "publisher", "book_type", "book_url",
]
merge_df = merge_df[output_columns]

In [None]:
# Spot-check 50 randomly chosen rows of the merged data (unseeded, so the rows differ from run to run)
merge_df.sample(50)

In [None]:
# Renumber the index from zero; drop=True discards the old index
merge_df = merge_df.reset_index(drop=True)

In [None]:
# Re-check shape and dtypes after the column selection and index reset
merge_df.info()

In [None]:
# Keep a copy of the merged frame under a descriptive name
course_textbooks = merge_df.copy()

In [None]:
# Render as csv file for import into PostgreSQL
# NOTE(review): the leading "/" makes this an absolute path at the filesystem root, unlike the
# relative paths used for courses.csv and textbooks.csv - confirm this is intended

course_textbooks.to_csv("/Resources/course_textbooks.csv", encoding="utf-8")

In [None]:
# 204 MB csv