# Setup

In [1]:
# Dependencies

from sqlalchemy import create_engine, inspect
import pandas as pd
import warnings

In [2]:
# Setting to reveal up to 500 rows in notebook

pd.set_option('display.max_rows', 500)

In [3]:
# Silence pink warnings

warnings.filterwarnings('ignore')

# Functions

In [4]:
def basic_clean(a_df):
    '''Performs initial cleaning of dataframe'''
    
    a_df = a_df.drop_duplicates(keep="first")
    
    return a_df

In [5]:
def all_caps(a_df, a_list):
    '''Renders string df columns in supplied list in all caps, returns dataframe'''
    
    for col in a_list:
        a_df[col] = a_df[col].str.upper()
        
    return a_df

In [6]:
def vc(a_ser):
    '''Return value_counts().to_frame() for a series'''
    
    return a_ser.value_counts().to_frame()

In [7]:
def clean_isbns(a_ser):
    '''Renders isbn series data as str and strips unwanted chars, returns a series'''
    
    a_ser = a_ser.astype("str") 
    a_ser = a_ser.str.rstrip(".0")
    
    return a_ser    

In [8]:
def clean_prices(a_ser):
    '''Strips unwanted chars from price data and renders as float, returns a series'''
    
    a_ser = a_ser.astype("str") 
    a_ser = a_ser.str.lstrip("$")
    a_ser = a_ser.str.replace(",", "", regex=False)
    a_ser = a_ser.str.replace("PRICE NOT YET AVAILABLE**", "0.01", regex=False)
    a_ser = a_ser.str.replace("PREPAID", "0.01", regex=False)
    a_ser = a_ser.apply(lambda x: float(x))

    return a_ser

# Import courses_#.csv files into dataframe, clean and transform data

### *courses_df*

In [9]:
# Read CSV files from List

csvs_files = ["Resources\\courses_1.csv", "Resources\\courses_2.csv", "Resources\\courses_3.csv"]

course_csv_list = (pd.read_csv(file) for file in csvs_files)

df1 = pd.concat(course_csv_list, ignore_index=True)

In [10]:
df1

Unnamed: 0.1,Unnamed: 0,university,store_id,catalog_id,campus,campus_id,term,term_id,department,department_id,course,course_id,section,section_id,scanDate
0,0,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,200,200,1,1,2022-02-05 09:46:58
1,1,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,01,01,2022-02-05 09:46:58
2,2,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,02,02,2022-02-05 09:46:58
3,3,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,03,03,2022-02-05 09:46:58
4,4,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,04,04,2022-02-05 09:46:58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537816,538044,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6328,107916807,RVC,107902500,2022-02-05 09:46:58
537817,538045,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6104C,107916813,RXD,107902492,2022-02-05 09:46:58
537818,538046,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6102,107916814,RXD,107902491,2022-02-05 09:46:58
537819,538047,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6102,107916814,RVC,107902490,2022-02-05 09:46:58


In [11]:
# Checking shape and dtypes of data

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537821 entries, 0 to 537820
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     537821 non-null  int64  
 1   university     537821 non-null  object 
 2   store_id       380343 non-null  float64
 3   catalog_id     55691 non-null   float64
 4   campus         516255 non-null  object 
 5   campus_id      537821 non-null  object 
 6   term           537821 non-null  object 
 7   term_id        537821 non-null  object 
 8   department     537821 non-null  object 
 9   department_id  537821 non-null  object 
 10  course         537821 non-null  object 
 11  course_id      537821 non-null  object 
 12  section        537821 non-null  object 
 13  section_id     537821 non-null  object 
 14  scanDate       537821 non-null  object 
dtypes: float64(2), int64(1), object(12)
memory usage: 61.5+ MB


In [12]:
# Eliminate dupes and drop unneeded columns

df1 = df1.drop_duplicates()

df1 = df1.drop(columns=["Unnamed: 0", "store_id", "catalog_id", "campus", "campus_id", "term_id", "scanDate"])

In [13]:
# Reorder columns and capitalize data where appropriate

df1 = df1[["department_id", "course_id", "section_id", "university", "term", "department", "course", "section"]]

all_caps(df1, ["university", "term", "department", "course", "section"])

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING 2022,ACC,200,1
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,01
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,02
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,03
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,04
...,...,...,...,...,...,...,...,...
537816,107904486,107916807,107902500,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6328,RVC
537817,107904486,107916813,107902492,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6104C,RXD
537818,107904486,107916814,107902491,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6102,RXD
537819,107904486,107916814,107902490,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6102,RVC


In [14]:
# Examine value counts on term column to begin consolidation into categories 

vc(df1["term"])

Unnamed: 0,term
SPRING 2022,287183
FALL 2021,67170
FALL 2022,19304
FALL 21,16880
WINTER - 2022,12627
SPRING '22,12417
WINTER 2022,12220
SUMMER 2022,11341
SPRING - 2022,10979
FALL SEMESTER 2021,9352


In [15]:
# Use boolean mask to consolidate Spring 2022 term data 

spring_mask = df1["term"].str.contains("SPRING")

df1.loc[spring_mask, "term"] = "SPRING"


fall_mask = df1["term"].str.contains("FALL")

df1.loc[fall_mask, "term"] = "FALL"


summer_mask = df1["term"].str.contains("SUMMER")

df1.loc[summer_mask, "term"] = "SUMMER"


winter_mask = df1["term"].str.contains("WINTER")

df1.loc[winter_mask, "term"] = "WINTER"



junk_mask = ~(df1["term"].isin(["SPRING", "FALL", "SUMMER", "WINTER"]))

df1.loc[junk_mask, "term"] = "JUNK"

df1 = df1.loc[~(df1["term"]=="JUNK"), :]


vc(df1["term"])

Unnamed: 0,term
SPRING,366147
FALL,114891
WINTER,27441
SUMMER,15489


In [16]:
df1 = df1.reset_index(drop=True)

In [17]:
courses_df = df1.copy()

In [18]:
courses_df

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING,ACC,200,1
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING,ACC,201,01
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING,ACC,201,02
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING,ACC,201,03
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING,ACC,201,04
...,...,...,...,...,...,...,...,...
523963,107904486,107916807,107902500,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6328,RVC
523964,107904486,107916813,107902492,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6104C,RXD
523965,107904486,107916814,107902491,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6102,RXD
523966,107904486,107916814,107902490,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6102,RVC


# Import textbooks_#.csv files into dataframe, clean and transform data

### *textbooks_df*

In [19]:
# Read CSV files from List

csvs_files = ["Resources\\textbooks_1.csv", "Resources\\textbooks_2.csv", "Resources\\textbooks_3.csv",\
              "Resources\\textbooks_4.csv", "Resources\\textbooks_5.csv"]

course_csv_list = (pd.read_csv(file) for file in csvs_files)

df2 = pd.concat(course_csv_list, ignore_index=True)

In [20]:
df2

Unnamed: 0.1,Unnamed: 0,store_id,catalog_id,campus_id,term_id,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,book_id,no_textbook_message,recommend_type,ISBN,price,scanDate
0,0,9301.0,,334,334_1_22_W,334_1_9,200,1,Cengage Unlimited - Access (1 Semester),18,CENGAGE U,Digital Purchase,https://bsu.bncollege.com/c/Cengage-Unlimited-...,MBS_5186058_dg,,eBook Option,9.780358e+12,$119.99,2022-02-05 09:46:58
1,1,9301.0,,334,334_1_22_W,334_1_9,201,01,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
2,2,9301.0,,334,334_1_22_W,334_1_9,201,02,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
3,3,9301.0,,334,334_1_22_W,334_1_9,201,03,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
4,4,9301.0,,334,334_1_22_W,334_1_9,201,04,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990722,992670,88196.0,10001.0,75156912,106477925,106757452,106762860,106737761,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990723,992671,88196.0,10001.0,75156912,106477925,106757452,106762860,106737762,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990724,992672,88196.0,10001.0,75156912,106477925,106757452,106762860,106737763,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990725,992673,88196.0,10001.0,75156912,106477925,106757452,106762860,106737764,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58


In [21]:
# Checking shape and dtypes of data

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990727 entries, 0 to 990726
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           990727 non-null  int64  
 1   store_id             669440 non-null  float64
 2   catalog_id           121358 non-null  float64
 3   campus_id            990727 non-null  object 
 4   term_id              990727 non-null  object 
 5   department_id        990727 non-null  object 
 6   course_id            990727 non-null  object 
 7   section_id           990727 non-null  object 
 8   title                645255 non-null  object 
 9   edition              633950 non-null  object 
 10  publisher            644969 non-null  object 
 11  book_type            645255 non-null  object 
 12  book_url             645255 non-null  object 
 13  book_id              645255 non-null  object 
 14  no_textbook_message  345472 non-null  object 
 15  recommend_type   

In [22]:
# Eliminate dupes and drop unneeded columns

df2 = df2.drop_duplicates()

df2 = df2.drop(columns=\
    ["Unnamed: 0", "store_id", "catalog_id", "campus_id", "term_id", "book_id",\
     "no_textbook_message", "recommend_type", "scanDate"])

In [23]:
# Reorder columns and capitalize data where appropriate

all_caps(df2, ["title", "edition", "publisher", "book_type", "price"])

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,ISBN,price
0,334_1_9,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9.780358e+12,$119.99
1,334_1_9,201,01,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
2,334_1_9,201,02,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
3,334_1_9,201,03,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
4,334_1_9,201,04,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
...,...,...,...,...,...,...,...,...,...,...
990722,106757452,106762860,106737761,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990723,106757452,106762860,106737762,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990724,106757452,106762860,106737763,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990725,106757452,106762860,106737764,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60


In [24]:
# Drop any rows where title, ISBN, and price are ALL missing (i.e., not much use for analysis)

df2 = df2.dropna(axis=0, subset=["title", "ISBN", "price"], how="all")

In [25]:
# Fill in missing values with default values  

df2 = df2.fillna(value={"edition":"unknown", "publisher":"unknown", "ISBN": 0.0, "price":"0.01"})

In [26]:
# Rename ISBN column to isbn 

df2 = df2.rename(columns={"ISBN":"isbn"})

In [27]:
# Transform isbn data

df2["isbn"] = clean_isbns(df2["isbn"])

In [28]:
# Transform price data

df2["price"] = clean_prices(df2["price"])

In [29]:
# Take a random sample of our dataframe to see if transformations are effective

df2.sample(50)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
130550,774_1_112,302,1001,SIMPLE STATISTICS : APPLICATIONS IN CRIMINOLOG...,07,OXF,NEW PRINT,https://unlv.bncollege.com/c/Simple-Statistics...,9780195330717,114.99
808783,8085_1_505,3010,027(8466),WHEN THE LITTLE THINGS COUNT . . . AND THEY AL...,07,HACHETTE B,USED PRINT RENTAL,https://unt.bncollege.com/c/When-the-Little-Th...,9781569242902,3.2
414148,450_450_235,302,101,INFORMATION TECHNOLOGY PROJECT MANAGEMENT,9TH 19,CENGAGE L,USED PRINT,https://psu.bncollege.com/c/Information-Techno...,9781337101356,117.0
511743,452_452_19,135,001,I SHALL NOT BE MOVED,91,PENG RAND,USED PRINT RENTAL,https://psu.bncollege.com/c/I-Shall-Not-Be-Mov...,9780553354584,5.3
467431,447_447_110,200,001,MATLAB PROGRAMMING FOR ENGINEERS,6TH 20,CENGAGE L,USED PRINT RENTAL,https://psu.bncollege.com/c/MATLAB-Programming...,9780357030394,55.5
324869,330_1_250,261,001,NO-DRAMA DISCIPLINE,14,PENG RAND,USED PRINT,https://liberty.bncollege.com/c/No-Drama-Disci...,9780345548061,7.91
648539,740_1_645,5740,24848,STUDY MANUAL PROGRAM FOR EXAMINATION FM (LOOSE...,15TH 21,ACTEX,USED PRINT,https://wayne.bncollege.com/c/Study-Manual-Pro...,9781647563929,141.43
910845,107331272,107331756,107324648,CRIMINALISTICS:INTRO.TO FORENSIC SCI.,12TH 18,PEARSON,RENT NEW,https://fiu.bncollege.com/shop/fiu/textbook/cr...,9780134477596,218.4
476270,204_1_305,3773,03,VOICES IN THE DARK,17,PENG RAND,USED PRINT RENTAL,https://msstate.bncollege.com/c/Voices-in-the-...,9781681371054,13.2
145354,8011_1_168,152,1,DRINK CULTURA : CHICANISMO,92,INGRAM PUB,NEW PRINT,https://sjsu.bncollege.com/c/Drink-Cultura--Ch...,9781877741074,15.15


In [30]:
df2 = df2.reset_index(drop=True)

In [31]:
textbooks_df = df2.copy()

In [32]:
textbooks_df.head(5)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
0,334_1_9,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9780357700006,119.99
1,334_1_9,201,1,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
2,334_1_9,201,2,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
3,334_1_9,201,3,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
4,334_1_9,201,4,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9


In [33]:
textbooks_df.sample(50)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
44478,480_1_172,120,C3,RULES FOR WRITERS,10TH 22,MAC HIGHER,NEW PRINT RENTAL,https://bu.bncollege.com/c/Rules-for-Writers/p...,9781319244255,48.25
335246,572_1_276,199,294,SKYEPACK KINE COMPLETE SPORT ACCESS >I<,NON-EXP,SKYEPACK,NEW PRINT,https://tamu.bncollege.com/c/SKYEPACK-KINE-COM...,2819480261133,46.67
8931,334_1_518,160,6,ICLICKER2 STUDENT REMOTE REV.,17,MAC HIGHER,USED PRINT RENTAL,https://bsu.bncollege.com/c/iClicker2-Student-...,9781498603041,38.75
317504,572_1_183,404,501,ENVIRONMENTAL ENGINEERING: PRINCIPLES AND PRAC...,14,WILEY P CC,DIGITAL PURCHASE,https://tamu.bncollege.com/c/Environmental-Eng...,9781118785997,86.0
349267,572_1_367,200,596,PACKBACK QUESTIONS ACCESS CODE >I<,W22,PACKBACK,NEW PRINT,https://tamu.bncollege.com/c/PACKBACK-QUESTION...,2819480261218,38.67
410395,619_1_90,11120,2,ASTRONOMY (OER),18,XANEDU C,USED PRINT,https://rowanbookstore.bncollege.com/c/Astrono...,9781938168284,43.5
44770,480_1_172,151,B1,PASSING,03,PENG RAND,USED PRINT,https://bu.bncollege.com/c/Passing/p/MBS_61938...,9780142437278,10.5
236480,396_1_170,1202,003,CAMPBELL BIOLOGY - WITH MOD. MASTERBIOLOGY,11TH 17,PEARSON,NEW PRINT,https://lsu.bncollege.com/c/Campbell-Biology--...,9780134683461,325.85
306778,572_1_171,203,516,JANE,15,PENG RAND,USED PRINT RENTAL,https://tamu.bncollege.com/c/Jane/p/MBS_193127...,9780143107941,7.5
50274,480_1_262,743,A1,INTRODUCTION TO STATISTICAL LEARNING: WITH APP...,13,SPRINGER,NEW PRINT,https://bu.bncollege.com/c/Introduction-to-Sta...,978146147137,79.99


# Merge courses and textbooks dataframes with inner join

### *merge_courses_and_textbooks_df*

In [34]:
# Inner merge on both dataframes on common id fields

merge_df = pd.merge(courses_df, textbooks_df, on=["department_id", "course_id", "section_id"], how="inner")

In [35]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 610343 entries, 0 to 610342
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   department_id  610343 non-null  object 
 1   course_id      610343 non-null  object 
 2   section_id     610343 non-null  object 
 3   university     610343 non-null  object 
 4   term           610343 non-null  object 
 5   department     610343 non-null  object 
 6   course         610343 non-null  object 
 7   section        610343 non-null  object 
 8   title          610343 non-null  object 
 9   edition        610343 non-null  object 
 10  publisher      610343 non-null  object 
 11  book_type      610343 non-null  object 
 12  book_url       610343 non-null  object 
 13  isbn           610343 non-null  object 
 14  price          610343 non-null  float64
dtypes: float64(1), object(14)
memory usage: 74.5+ MB


In [36]:
merge_df.drop_duplicates()

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section,title,edition,publisher,book_type,book_url,isbn,price
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING,ACC,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9780357700006,119.99
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING,ACC,201,01,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING,ACC,201,02,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING,ACC,201,03,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING,ACC,201,04,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610338,107904486,107916708,107902516,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6530,U01,KOTCH'S MATERNAL+CHILD HEALTH,4TH 22,JONES+BART,BUY DIGITAL,https://fiu.bncollege.com/shop/BNCB_TextbookDe...,9781284200256,59.12
610339,107904486,107916805,107902503,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RXC,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY USED,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,90.00
610340,107904486,107916805,107902503,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RXC,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY NEW,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,120.00
610341,107904486,107916805,107902504,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RX02,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY USED,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,90.00


In [37]:
merge_df = merge_df[["university", "term", "department", "course", "section", "title", "edition", "price",\
                     "isbn", "publisher", "book_type", "book_url"]]

In [38]:
merge_df.sample(50)

Unnamed: 0,university,term,department,course,section,title,edition,price,isbn,publisher,book_type,book_url
186048,KENT STATE UNIVERSITY KENT CAMPUS,FALL,ECED,44444,002,COMING OF AGE: THE EDUCATION AND DEVELOPMENT O...,07,22.87,9781560902119,AMLE,USED PRINT,https://kent.bncollege.com/c/Coming-of-Age-The...
397740,UNLV BOOKSTORE,SPRING,NURS,740R,1001,PRACTICE GUIDELINES FOR FAMILY NURSE PRACTITIO...,5TH 20,65.24,9780323554947,ELSEVIER,USED PRINT,https://unlv.bncollege.com/c/Practice-Guidelin...
425235,UNIVERSITY OF MISSISSIPPI,SPRING,MGMT,495,SECTION 3,HONEST TRUTH ABOUT DISHONESTY,12,8.0,9780062183613,HARP PUB,USED PRINT RENTAL,https://ole-miss.bncollege.com/c/Honest-Truth-...
483633,GEORGE MASON UNIVERSITY,SPRING,AVT,413,002,AIGA PROFESSIONAL PRACTICES IN GRAPHIC DESIGN,2ND 08,24.7,9781581155099,S+S,NEW PRINT RENTAL,https://gmu.bncollege.com//c/Aiga-Professional...
561672,UNIVERSITY OF NORTH CAROLINA AT CHARLOTTE,FALL,LBST,2101,213,THIRTY YEARS A SLAVE : FROM BONDAGE TO FREEDOM,02,7.05,9781588380913,NEW SOUTH,USED PRINT RENTAL,https://charlotte.bncollege.com/c/Thirty-Years...
87428,INDIANA UNIVERSITY BLOOMINGTON,FALL,ENG-W,131,23015,ENGLISH GRAMMAR AND PUNCTUATION,12,6.6,9781423218654,BARCHARTS,NEW PRINT,https://iub.bncollege.com/c/English-Grammar-an...
420955,UNIVERSITY OF MISSISSIPPI,SPRING,ENG,222,SECTION 2,CONCERT OF VOICES,2ND 09,24.9,9781551119779,BROADVIEW,USED PRINT RENTAL,https://ole-miss.bncollege.com/c/Concert-of-Vo...
410255,UPENN,SPRING,NURS,547,002,READING RESEARCH: A USER-FRIENDLY GUIDE FOR HE...,7TH 21,11.99,9780323759243,ELSEVIER,USED PRINT,https://upenn.bncollege.com/c/Reading-Research...
212973,PENN STATE UNIVERSITY PARK,SPRING,FIN,301,005,FUNDAMENTALS OF CORPORATE FINANCE (LOOSELEAF),10TH 20,170.56,97812607039,MCG,NEW PRINT,https://psu.bncollege.com/c/Fundamentals-of-Co...
1756,BALL STATE UNIVERSITY,SPRING,CHEM,230,11,ESSENTIAL ORGANIC CHEMISTRY,3RD 16,259.99,9780321937711,PEARSON,NEW PRINT,https://bsu.bncollege.com/c/Essential-Organic-...


In [39]:
merge_df = merge_df.reset_index(drop=True)

In [40]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610343 entries, 0 to 610342
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   university  610343 non-null  object 
 1   term        610343 non-null  object 
 2   department  610343 non-null  object 
 3   course      610343 non-null  object 
 4   section     610343 non-null  object 
 5   title       610343 non-null  object 
 6   edition     610343 non-null  object 
 7   price       610343 non-null  float64
 8   isbn        610343 non-null  object 
 9   publisher   610343 non-null  object 
 10  book_type   610343 non-null  object 
 11  book_url    610343 non-null  object 
dtypes: float64(1), object(11)
memory usage: 55.9+ MB


#### Connect to engine

# Import Security 
from config.py import user

In [41]:
protocol = "postgresql"
user = "postgres"
password = "bootcamp"
host = "localhost"
port = "5432"
db = "library_db"
conn_string = f'{protocol}://{user}:{password}@{host}:{port}/{db}'
engine = create_engine(conn_string)

In [42]:
inspector = inspect(engine)
inspector.get_table_names()

['course_textbooks']

In [44]:
merge_df.to_sql(name='course_textbooks', con=engine, if_exists='append', index=False)

343

In [49]:
pd.read_sql_query('select * from course_textbooks', con=engine).head(5)

Unnamed: 0,id,university,term,department,course,section,title,edition,price,isbn,publisher,book_type,book_url
0,1,BALL STATE UNIVERSITY,SPRING,ACC,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,119.99,9780357700006,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...
1,2,BALL STATE UNIVERSITY,SPRING,ACC,201,1,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,107.9,9781264273881,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...
2,3,BALL STATE UNIVERSITY,SPRING,ACC,201,2,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,107.9,9781264273881,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...
3,4,BALL STATE UNIVERSITY,SPRING,ACC,201,3,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,107.9,9781264273881,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...
4,5,BALL STATE UNIVERSITY,SPRING,ACC,201,4,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,107.9,9781264273881,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...
