In [None]:
import pandas as pd

In [None]:
# Raise the pandas display limit so up to 500 rows are shown before truncation.
pd.set_option('display.max_rows', 500)

In [None]:
def basic_clean(a_df):
    """Perform the initial cleaning pass on a dataframe.

    Removes rows that are exact duplicates of an earlier row (the first
    occurrence is kept) and returns the resulting dataframe. The frame
    passed in is left untouched.
    """
    deduplicated = a_df.drop_duplicates(keep="first")
    return deduplicated

In [None]:
def all_caps(a_df, a_list):
    """Upper-case the string columns of ``a_df`` named in ``a_list``.

    The columns are overwritten on the dataframe that was passed in, and that
    same dataframe object is returned.
    """
    for column_name in a_list:
        upper_cased = a_df[column_name].str.upper()
        a_df[column_name] = upper_cased

    return a_df

In [None]:
def vc(a_ser):
    """Tabulate how often each distinct value occurs in a series.

    Returns the series' ``value_counts()`` result converted to a dataframe.
    """
    counts = a_ser.value_counts()
    return counts.to_frame()

In [None]:
def clean_isbns(a_ser):
    """Render ISBN data as strings without the trailing ``.0`` float artefact.

    Parameters
    ----------
    a_ser : pandas.Series
        ISBN values; typically floats (e.g. ``9780134093410.0``) because the
        column was parsed as numeric.

    Returns
    -------
    pandas.Series
        The values converted to ``str`` with one literal ``".0"`` suffix
        removed when present. All other characters, including genuine
        trailing zeros of the ISBN, are preserved. A value of ``0.0``
        becomes ``"0"``.
    """
    as_text = a_ser.astype("str")

    # str.rstrip(".0") treats its argument as a *set* of characters and would
    # also eat real trailing zeros ("9780134093410.0" -> "978013409341").
    # An anchored regex removes only the literal ".0" suffix.
    return as_text.str.replace(r"\.0$", "", regex=True)

In [None]:
def clean_prices(a_ser):
    """Convert textual price data to floats and return the new series.

    Values are cast to ``str``, leading ``$`` signs and thousands separators
    are removed, and the placeholder texts "PRICE NOT YET AVAILABLE**" and
    "PREPAID" are substituted with "0.01" before each value is parsed with
    ``float``.
    """
    placeholder_texts = ("PRICE NOT YET AVAILABLE**", "PREPAID")

    cleaned = (
        a_ser.astype("str")
        .str.lstrip("$")
        .str.replace(",", "", regex=False)
    )

    # Substitute placeholders in the same order as they are listed above.
    for placeholder in placeholder_texts:
        cleaned = cleaned.str.replace(placeholder, "0.01", regex=False)

    return cleaned.apply(float)

In [None]:
# Absolute paths to the two input CSV files (course listing and textbook listing).
filepath1 = "C:\\Users\\ej2595ht\\Desktop\\textbook_data\\BNCollegeCourses_2022-02-05.csv"
filepath2 = "C:\\Users\\ej2595ht\\Desktop\\textbook_data\\BNTextbook_2022-02-05.csv"

In [None]:
# Load the courses CSV and remove exact duplicate rows via basic_clean().
# low_memory=False has pandas process the file in one pass rather than in
# chunks when inferring column dtypes.
df1 = basic_clean(pd.read_csv(filepath1, encoding="Utf-8", low_memory=False))

In [None]:
# Load the textbooks CSV the same way.
df2 = basic_clean(pd.read_csv(filepath2, encoding="Utf-8", low_memory=False))

# course_df

In [None]:
# Inspect the columns, dtypes and non-null counts of the raw courses frame.
df1.info()

In [None]:
# Drop exact duplicate rows (basic_clean() already did this once at load time).
df1 = df1.drop_duplicates()

In [None]:
# Remove the columns that are not carried forward into the cleaned courses frame.
df1 = df1.drop(columns=["store_id", "catalog_id", "campus", "campus_id", "term_id", "section_id", "scanDate"])

In [None]:
df1.info()

In [None]:
# Select the remaining columns in a fixed order: id columns first, then labels.
df1 = df1[["department_id", "course_id", "university", "term", "department", "course", "section"]]

In [None]:
# Renumber rows from 0 and discard the old index.
df1 = df1.reset_index(drop=True)

In [None]:
# Upper-case the text columns. all_caps() overwrites the columns on df1
# itself, so the return value does not need to be assigned.
all_caps(df1, ["university", "term", "department", "course", "section"])

In [None]:
# Frequency table of the raw term labels.
vc(df1["term"])

In [None]:
spring_mask = df1["term"].str.contains("SPRING")

df1.loc[spring_mask, "term"] = "SPRING"

In [None]:
fall_mask = df1["term"].str.contains("FALL")

df1.loc[fall_mask, "term"] = "FALL"

In [None]:
summer_mask = df1["term"].str.contains("SUMMER")

df1.loc[summer_mask, "term"] = "SUMMER"

In [None]:
winter_mask = df1["term"].str.contains("WINTER")

df1.loc[winter_mask, "term"] = "WINTER"

In [None]:
# Frequency table of term labels after the season normalisation.
vc(df1["term"])

In [None]:
# Flag every row whose term is not one of the four season names.
junk_mask = ~(df1["term"].isin(["FALL", "WINTER", "SPRING", "SUMMER"]))

df1.loc[junk_mask, "term"] = "JUNK"

In [None]:
vc(df1["term"])

In [None]:
# Keep only the rows that were not flagged as "JUNK".
df1 = df1.loc[ ~(df1["term"]=="JUNK")   , :]

In [None]:
vc(df1["term"])

In [None]:
# Renumber rows from 0 after the filtering above removed rows.
df1 = df1.reset_index(drop=True)

In [None]:
# Snapshot the cleaned courses data under its final name.
courses_df = df1.copy()

In [None]:
courses_df.info()

# textbooks_df

In [None]:
# Inspect the columns, dtypes and non-null counts of the raw textbooks frame.
df2.info()

In [None]:
# NOTE(review): the result of drop_duplicates() is not assigned, so df2 is
# unchanged by this cell (basic_clean() already dropped duplicates at load time).
df2.drop_duplicates()

In [None]:
# Remove the columns that are not carried forward into the cleaned textbooks frame.
df2 = df2.drop(columns=["store_id", "catalog_id", "campus_id", "term_id", "section_id", "book_id", "no_textbook_message", "recommend_type", "scanDate"])

In [None]:
df2.info()

In [None]:
# Upper-case the text columns. all_caps() overwrites the columns on df2
# itself, so the return value does not need to be assigned.
# NOTE(review): "price" is upper-cased with the .str accessor, so it is
# presumably still text at this point — confirm against the df2.info() output.
all_caps(df2, ["title", "edition", "publisher", "book_type", "price"])

In [None]:
df2.info()

In [None]:
# Frequency table of the raw ISBN values.
vc(df2["ISBN"])

In [None]:
# Drop rows only when title, ISBN and price are ALL missing (how="all").
df2 = df2.dropna(axis=0, subset=["title", "ISBN", "price"], how="all")

In [None]:
df2.info()

In [None]:
# Fill the remaining gaps with placeholders: "unknown" for edition and
# publisher, 0.0 for ISBN, and the string "0.01" for price (price is
# converted to float later by clean_prices()).
df2 = df2.fillna(value={"edition":"unknown", "publisher":"unknown", "ISBN": 0.0, "price":"0.01"})

In [None]:
df2.info()

In [None]:
# Use a lower-case column name for the ISBN column.
df2 = df2.rename(columns={"ISBN":"isbn"})

In [None]:
# Convert the ISBN values to strings via clean_isbns().
df2["isbn"] = clean_isbns(df2["isbn"])

In [None]:
df2

In [None]:
# Convert the price text to floats via clean_prices().
df2["price"] = clean_prices(df2["price"])

In [None]:
df2

In [None]:
df2.dtypes

In [None]:
# Renumber rows from 0 after the dropna() above removed rows.
df2 = df2.reset_index(drop=True)

In [None]:
# Snapshot the cleaned textbooks data under its final name.
textbooks_df = df2.copy()

In [None]:
textbooks_df

In [None]:
# Inspect 50 randomly chosen rows.
textbooks_df.sample(50)

In [None]:
textbooks_df.info()

# merge_df

In [None]:
merge_df = pd.merge(courses_df, textbooks_df, on=["department_id", "course_id"], how="inner")

In [None]:
merge_df.info()

In [None]:
merge_df.drop_duplicates()

In [None]:
merge_df.sample(50)

In [None]:
final_merge_df = merge_df.reset_index(drop=True)

In [None]:
final_merge_df.info()

In [None]:
final_merge_df.to_csv("final_merge_df.csv", encoding="utf-8")

In [None]:
# 2.28 GB csv!