# Setup

In [1]:
# Dependencies

from sqlalchemy import create_engine, inspect
import pandas as pd
import warnings

In [2]:
# Setting to reveal up to 500 rows in notebook

pd.set_option('display.max_rows', 500)

In [3]:
# Silence pink warnings

warnings.filterwarnings('ignore')

# Functions

In [4]:
def basic_clean(a_df):
    '''Performs initial cleaning of dataframe'''
    
    a_df = a_df.drop_duplicates(keep="first")
    
    return a_df

In [5]:
def all_caps(a_df, a_list):
    '''Renders string df columns in supplied list in all caps, returns dataframe'''
    
    for col in a_list:
        a_df[col] = a_df[col].str.upper()
        
    return a_df

In [6]:
def vc(a_ser):
    '''Return value_counts().to_frame() for a series'''
    
    return a_ser.value_counts().to_frame()

In [7]:
def clean_isbns(a_ser):
    '''Renders isbn series data as str and strips unwanted chars, returns a series'''
    
    a_ser = a_ser.astype("str") 
    a_ser = a_ser.str.rstrip(".0")
    
    return a_ser    

In [8]:
def clean_prices(a_ser):
    '''Strips unwanted chars from price data and renders as float, returns a series'''
    
    a_ser = a_ser.astype("str") 
    a_ser = a_ser.str.lstrip("$")
    a_ser = a_ser.str.replace(",", "", regex=False)
    a_ser = a_ser.str.replace("PRICE NOT YET AVAILABLE**", "0.01", regex=False)
    a_ser = a_ser.str.replace("PREPAID", "0.01", regex=False)
    a_ser = a_ser.apply(lambda x: float(x))

    return a_ser

# Import courses_#.csv files into dataframe, clean and transform data

### *courses_df*

In [9]:
# Read CSV files from List

csvs_files = ["Resources\\courses_1.csv", "Resources\\courses_2.csv", "Resources\\courses_3.csv"]

course_csv_list = (pd.read_csv(file) for file in csvs_files)

df1 = pd.concat(course_csv_list, ignore_index=True)

In [10]:
df1

Unnamed: 0.1,Unnamed: 0,university,store_id,catalog_id,campus,campus_id,term,term_id,department,department_id,course,course_id,section,section_id,scanDate
0,0,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,200,200,1,1,2022-02-05 09:46:58
1,1,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,01,01,2022-02-05 09:46:58
2,2,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,02,02,2022-02-05 09:46:58
3,3,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,03,03,2022-02-05 09:46:58
4,4,Ball State University,9301.0,,Ball State University Official Bookstore,334,SPRING 2022,334_1_22_W,ACC,334_1_9,201,201,04,04,2022-02-05 09:46:58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537816,538044,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6328,107916807,RVC,107902500,2022-02-05 09:46:58
537817,538045,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6104C,107916813,RXD,107902492,2022-02-05 09:46:58
537818,538046,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6102,107916814,RXD,107902491,2022-02-05 09:46:58
537819,538047,Florida International University,21551.0,10001.0,FIU - Maidique Campus & All Online Courses,19956951,FALL 2022,107903977,PHC,107904486,6102,107916814,RVC,107902490,2022-02-05 09:46:58


In [11]:
# Checking shape and dtypes of data

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537821 entries, 0 to 537820
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     537821 non-null  int64  
 1   university     537821 non-null  object 
 2   store_id       380343 non-null  float64
 3   catalog_id     55691 non-null   float64
 4   campus         516255 non-null  object 
 5   campus_id      537821 non-null  object 
 6   term           537821 non-null  object 
 7   term_id        537821 non-null  object 
 8   department     537821 non-null  object 
 9   department_id  537821 non-null  object 
 10  course         537821 non-null  object 
 11  course_id      537821 non-null  object 
 12  section        537821 non-null  object 
 13  section_id     537821 non-null  object 
 14  scanDate       537821 non-null  object 
dtypes: float64(2), int64(1), object(12)
memory usage: 61.5+ MB


In [12]:
# Eliminate dupes and drop unneeded columns

df1 = df1.drop_duplicates()

df1 = df1.drop(columns=["Unnamed: 0", "store_id", "catalog_id", "campus", "campus_id", "term_id", "scanDate"])

In [13]:
# Reorder columns and capitalize data where appropriate

df1 = df1[["department_id", "course_id", "section_id", "university", "term", "department", "course", "section"]]

all_caps(df1, ["university", "term", "department", "course", "section"])

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING 2022,ACC,200,1
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,01
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,02
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,03
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING 2022,ACC,201,04
...,...,...,...,...,...,...,...,...
537816,107904486,107916807,107902500,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6328,RVC
537817,107904486,107916813,107902492,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6104C,RXD
537818,107904486,107916814,107902491,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6102,RXD
537819,107904486,107916814,107902490,FLORIDA INTERNATIONAL UNIVERSITY,FALL 2022,PHC,6102,RVC


In [14]:
# Examine value counts on term column to begin consolidation into categories 

vc(df1["term"])

Unnamed: 0,term
SPRING 2022,287183
FALL 2021,67170
FALL 2022,19304
FALL 21,16880
WINTER - 2022,12627
SPRING '22,12417
WINTER 2022,12220
SUMMER 2022,11341
SPRING - 2022,10979
FALL SEMESTER 2021,9352


In [15]:
# Use boolean mask to consolidate Spring 2022 term data 

spring_mask = df1["term"].str.contains("SPRING")

df1.loc[spring_mask, "term"] = "SPRING"


fall_mask = df1["term"].str.contains("FALL")

df1.loc[fall_mask, "term"] = "FALL"


summer_mask = df1["term"].str.contains("SUMMER")

df1.loc[summer_mask, "term"] = "SUMMER"


winter_mask = df1["term"].str.contains("WINTER")

df1.loc[winter_mask, "term"] = "WINTER"



junk_mask = ~(df1["term"].isin(["SPRING", "FALL", "SUMMER", "WINTER"]))

df1.loc[junk_mask, "term"] = "JUNK"

df1 = df1.loc[~(df1["term"]=="JUNK"), :]


vc(df1["term"])

Unnamed: 0,term
SPRING,366147
FALL,114891
WINTER,27441
SUMMER,15489


In [16]:
df1 = df1.reset_index(drop=True)

In [17]:
courses_df = df1.copy()

In [18]:
courses_df

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING,ACC,200,1
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING,ACC,201,01
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING,ACC,201,02
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING,ACC,201,03
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING,ACC,201,04
...,...,...,...,...,...,...,...,...
523963,107904486,107916807,107902500,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6328,RVC
523964,107904486,107916813,107902492,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6104C,RXD
523965,107904486,107916814,107902491,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6102,RXD
523966,107904486,107916814,107902490,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6102,RVC


# Import textbooks_#.csv files into dataframe, clean and transform data

### *textbooks_df*

In [19]:
# Read CSV files from List

csvs_files = ["Resources\\textbooks_1.csv", "Resources\\textbooks_2.csv", "Resources\\textbooks_3.csv",\
              "Resources\\textbooks_4.csv", "Resources\\textbooks_5.csv"]

course_csv_list = (pd.read_csv(file) for file in csvs_files)

df2 = pd.concat(course_csv_list, ignore_index=True)

In [20]:
df2

Unnamed: 0.1,Unnamed: 0,store_id,catalog_id,campus_id,term_id,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,book_id,no_textbook_message,recommend_type,ISBN,price,scanDate
0,0,9301.0,,334,334_1_22_W,334_1_9,200,1,Cengage Unlimited - Access (1 Semester),18,CENGAGE U,Digital Purchase,https://bsu.bncollege.com/c/Cengage-Unlimited-...,MBS_5186058_dg,,eBook Option,9.780358e+12,$119.99,2022-02-05 09:46:58
1,1,9301.0,,334,334_1_22_W,334_1_9,201,01,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
2,2,9301.0,,334,334_1_22_W,334_1_9,201,02,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
3,3,9301.0,,334,334_1_22_W,334_1_9,201,03,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
4,4,9301.0,,334,334_1_22_W,334_1_9,201,04,Financial Accounting: Information for Decision...,10TH 21,MCG CUSTOM,New Print,https://bsu.bncollege.com/c/Financial-Accounti...,MBS_5780024_new,,REQUIRED,9.781264e+12,$107.90,2022-02-05 09:46:58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990722,992670,88196.0,10001.0,75156912,106477925,106757452,106762860,106737761,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990723,992671,88196.0,10001.0,75156912,106477925,106757452,106762860,106737762,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990724,992672,88196.0,10001.0,75156912,106477925,106757452,106762860,106737763,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58
990725,992673,88196.0,10001.0,75156912,106477925,106757452,106762860,106737764,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,550036485535.0,,REQUIRED,9.781646e+12,$45.60,2022-02-05 09:46:58


In [21]:
# Checking shape and dtypes of data

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990727 entries, 0 to 990726
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           990727 non-null  int64  
 1   store_id             669440 non-null  float64
 2   catalog_id           121358 non-null  float64
 3   campus_id            990727 non-null  object 
 4   term_id              990727 non-null  object 
 5   department_id        990727 non-null  object 
 6   course_id            990727 non-null  object 
 7   section_id           990727 non-null  object 
 8   title                645255 non-null  object 
 9   edition              633950 non-null  object 
 10  publisher            644969 non-null  object 
 11  book_type            645255 non-null  object 
 12  book_url             645255 non-null  object 
 13  book_id              645255 non-null  object 
 14  no_textbook_message  345472 non-null  object 
 15  recommend_type   

In [22]:
# Eliminate dupes and drop unneeded columns

df2 = df2.drop_duplicates()

df2 = df2.drop(columns=\
    ["Unnamed: 0", "store_id", "catalog_id", "campus_id", "term_id", "book_id",\
     "no_textbook_message", "recommend_type", "scanDate"])

In [23]:
# Reorder columns and capitalize data where appropriate

all_caps(df2, ["title", "edition", "publisher", "book_type", "price"])

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,ISBN,price
0,334_1_9,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9.780358e+12,$119.99
1,334_1_9,201,01,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
2,334_1_9,201,02,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
3,334_1_9,201,03,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
4,334_1_9,201,04,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9.781264e+12,$107.90
...,...,...,...,...,...,...,...,...,...,...
990722,106757452,106762860,106737761,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990723,106757452,106762860,106737762,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990724,106757452,106762860,106737763,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60
990725,106757452,106762860,106737764,BIOL 252 LAB PRINTED ACCESS CODE,W21,VAN-GRINER,BUY NEW,https://unc.bncollege.com/shop/BNCB_TextbookDe...,9.781646e+12,$45.60


In [24]:
# Drop any rows where title, ISBN, and price are ALL missing (i.e., not much use for analysis)

df2 = df2.dropna(axis=0, subset=["title", "ISBN", "price"], how="all")

In [25]:
# Fill in missing values with default values  

df2 = df2.fillna(value={"edition":"unknown", "publisher":"unknown", "ISBN": 0.0, "price":"0.01"})

In [26]:
# Rename ISBN column to isbn 

df2 = df2.rename(columns={"ISBN":"isbn"})

In [27]:
# Transform isbn data

df2["isbn"] = clean_isbns(df2["isbn"])

In [28]:
# Transform price data

df2["price"] = clean_prices(df2["price"])

In [29]:
# Take a random sample of our dataframe to see if transformations are effective

df2.sample(50)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
57160,480_1_198,742,B1,ASSOCIATED PRESS STYLEBOOK AND BRIEFING ON MED...,20,HACHETTE B,USED PRINT,https://bu.bncollege.com/Textbooks/Associated-...,9781541647572,18.74
434207,8159_1_548,380,001,ANALYZING SOCIAL NETWORKS,2ND 18,SAGE,NEW PRINT,https://uky.bncollege.com/c/Analyzing-Social-N...,9781526404107,54.0
279404,631_1_61,340,21223,HUMAN RESOURCE MANAGEMENT (LOOSELEAF),12TH 21,MCG,NEW PRINT,https://iupui.bncollege.com/c/Human-Resource-M...,9781260780765,155.8
522339,572_1_303,607,600,INTRO. TO POLYMERS,3RD 11,TAYLOR,NEW PRINT,https://tamu.bncollege.com/c/Intro-to-Polymers...,9780849339295,69.95
331191,330_1_450,843,D06,PUBLICATION MANUAL OF THE AMERICAN PSYCHOLOGIC...,7TH 20,AM PSYCHOL,USED PRINT,https://liberty.bncollege.com/c/Publication-Ma...,9781433832161,22.06
116536,656_1_805,6015,71249,PUBLICATION MANUAL OF THE AMERICAN PSYCHOLOGIC...,7TH 20,AM PSYCHOL,DIGITAL PURCHASE,https://northeastern.bncollege.com/c/Publicati...,9781433832161,35.99
739687,8085_1_85,4035,900(9420),ASSESSMENT AND TREATMENT OF SPEECH - WITH CD,3RD 15,PRO-ED,USED PRINT,https://unt.bncollege.com/c/Assessment-and-Tre...,9781416405801,101.78
223298,244_1_94,4150,13373,CLINICAL HEMATOLOGY ATLAS,5TH 17,ELSEVIER,USED PRINT RENTAL,https://oakland.bncollege.com/Textbooks/Clinic...,9780323322492,38.4
327699,330_1_404,334,001,FUNDAMENTALS OF DIFFERENTIAL EQUATIONS,9TH 18,PEARSON,USED PRINT,https://liberty.bncollege.com/c/Fundamentals-o...,9780321977069,147.92
614883,435_1_806,478,001,PRINCIPLES OF NEUROPSYCHOLOGY,2ND 08,CENGAGE L,USED PRINT RENTAL,https://psu.bncollege.com/c/Principles-of-Neur...,9780495003762,148.75


In [30]:
df2 = df2.reset_index(drop=True)

In [31]:
textbooks_df = df2.copy()

In [32]:
textbooks_df.head(5)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
0,334_1_9,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9780357700006,119.99
1,334_1_9,201,1,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
2,334_1_9,201,2,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
3,334_1_9,201,3,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9
4,334_1_9,201,4,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.9


In [33]:
textbooks_df.sample(50)

Unnamed: 0,department_id,course_id,section_id,title,edition,publisher,book_type,book_url,isbn,price
303797,317_1_23,162,SECTION 1,LIFE: THE SCIENCE OF BIOLOGY - LAUNCHPAD ACCESS,12TH 20,MAC HIGHER,NEW PRINT,https://ole-miss.bncollege.com/c/Life-The-Scie...,9781319284435,137.14
468085,8085_1_10,4320,001(12215),TAXATION OF INDIVIDUALS AND BUSINESS ENTITIES ...,13TH 22,MCG CUSTOM,NEW PRINT,https://unt.bncollege.com/c/Taxation-of-Indivi...,9781266860072,150.0
553795,8085_1_670,1400,004(13757),MORALITY AND MORAL CONTROVERSIES: READINGS IN ...,10TH 19,TAYLOR,USED PRINT RENTAL,https://unt.bncollege.com/c/Morality-and-Moral...,9780415789318,79.05
241260,8152_1_660,13022,012,REAL TIME PHYSICS - ACTIVITY LEARNING LABORATO...,12,WILEY,NEW PRINT,https://kent.bncollege.com/c/Real-Time-Physics...,9780470768884,36.06
625544,105160883,105171233,105152121,CREATING SIGNIFICANT LEARN.EXPERIENCES,(REV)13,WILEY,RENT USED,https://fiu.bncollege.com/shop/BNCB_TextbookDe...,9781118124253,21.4
366535,572_1_426,480,900,FUTURE OF AN ILLUSION,61,NORTON,NEW PRINT,https://tamu.bncollege.com/Textbooks/Future-of...,9780393008319,13.95
158980,336_1_285,5273,22958,"SCALE: THE UNIVERSAL LAWS OF GROWTH, INNOVATIO...",17,PENG RAND,USED PRINT RENTAL,https://uco.bncollege.com/c/Scale-The-Universa...,9781594205583,13.25
507028,583_583_210,102,061,"EASY WRITER, 2020 APA UPDATED",7TH 20,MPS PUB,DIGITAL RENTAL,https://wvu.bncollege.com/c/Easy-Writer-2020-A...,8220122863457,19.99
462995,620_1_235,483,901,YELLOW WALLPAPER AND OTHER WRITING,89,PENG RAND,USED PRINT RENTAL,https://vcu.bncollege.com//c/Yellow-Wallpaper-...,9780553213751,3.05
469579,327_1_31,2160,0W61,SKEPTIC'S GUIDE TO SPORTS SCIENCE: CONFRONTING...,20,TAYLOR,NEW PRINT,https://ucf.bncollege.com/c/Skeptics-Guide-to-...,9781138333123,160.0


# Merge courses and textbooks dataframes with inner join

### *merge_courses_and_textbooks_df*

In [34]:
# Inner merge on both dataframes on common id fields

merge_df = pd.merge(courses_df, textbooks_df, on=["department_id", "course_id", "section_id"], how="inner")

In [35]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 610343 entries, 0 to 610342
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   department_id  610343 non-null  object 
 1   course_id      610343 non-null  object 
 2   section_id     610343 non-null  object 
 3   university     610343 non-null  object 
 4   term           610343 non-null  object 
 5   department     610343 non-null  object 
 6   course         610343 non-null  object 
 7   section        610343 non-null  object 
 8   title          610343 non-null  object 
 9   edition        610343 non-null  object 
 10  publisher      610343 non-null  object 
 11  book_type      610343 non-null  object 
 12  book_url       610343 non-null  object 
 13  isbn           610343 non-null  object 
 14  price          610343 non-null  float64
dtypes: float64(1), object(14)
memory usage: 74.5+ MB


In [36]:
merge_df.drop_duplicates()

Unnamed: 0,department_id,course_id,section_id,university,term,department,course,section,title,edition,publisher,book_type,book_url,isbn,price
0,334_1_9,200,1,BALL STATE UNIVERSITY,SPRING,ACC,200,1,CENGAGE UNLIMITED - ACCESS (1 SEMESTER),18,CENGAGE U,DIGITAL PURCHASE,https://bsu.bncollege.com/c/Cengage-Unlimited-...,9780357700006,119.99
1,334_1_9,201,01,BALL STATE UNIVERSITY,SPRING,ACC,201,01,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
2,334_1_9,201,02,BALL STATE UNIVERSITY,SPRING,ACC,201,02,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
3,334_1_9,201,03,BALL STATE UNIVERSITY,SPRING,ACC,201,03,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
4,334_1_9,201,04,BALL STATE UNIVERSITY,SPRING,ACC,201,04,FINANCIAL ACCOUNTING: INFORMATION FOR DECISION...,10TH 21,MCG CUSTOM,NEW PRINT,https://bsu.bncollege.com/c/Financial-Accounti...,9781264273881,107.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610338,107904486,107916708,107902516,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6530,U01,KOTCH'S MATERNAL+CHILD HEALTH,4TH 22,JONES+BART,BUY DIGITAL,https://fiu.bncollege.com/shop/BNCB_TextbookDe...,9781284200256,59.12
610339,107904486,107916805,107902503,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RXC,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY USED,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,90.00
610340,107904486,107916805,107902503,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RXC,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY NEW,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,120.00
610341,107904486,107916805,107902504,FLORIDA INTERNATIONAL UNIVERSITY,FALL,PHC,6374,RX02,HAZARD MITIGATION+PREPAREDNESS,2ND 17,TAYLOR,BUY USED,https://fiu.bncollege.com/shop/fiu/textbook/ha...,9781466595569,90.00


In [37]:
merge_df = merge_df[["university", "term", "department", "course", "section", "title", "edition", "price",\
                     "isbn", "publisher", "book_type", "book_url"]]

In [38]:
merge_df.sample(50)

Unnamed: 0,university,term,department,course,section,title,edition,price,isbn,publisher,book_type,book_url
487836,GEORGE MASON UNIVERSITY,SPRING,ENGH,202,DL3,EXIT WEST,18,8.4,9780735212206,PENG RAND,USED PRINT RENTAL,https://gmu.bncollege.com//c/Exit-West/p/MBS_2...
204422,KENT STATE UNIVERSITY KENT CAMPUS,SPRING,NURS,60403,005,PRACTICE OF NURSING RESEARCH,8TH 17,48.5,9780323377584,ELSEVIER,USED PRINT RENTAL,https://kent.bncollege.com/c/Practice-of-Nursi...
133004,LOUISIANA STATE UNIVERSITY,SPRING,CMST,1061,007,FUNDAMENTAL COMMUNICATION SKILLS,3RD 20,31.45,9781792429279,K/H,USED PRINT,https://lsu.bncollege.com/c/Fundamental-Commun...
84045,INDIANA UNIVERSITY BLOOMINGTON,FALL,CHEM-S,342,12925,ORGANIC CHEMISTRY: STRUCTURE AND FUNCTION,8TH 18,285.06,9781319079451,MAC HIGHER,USED PRINT,https://iub.bncollege.com/c/Organic-Chemistry-...
53293,BOSTON UNIVERSITY,SPRING,CGS HU,202,UU,CREATING CAPABILITIES,11,13.8,978067407235,TRILITERAL,NEW PRINT RENTAL,https://bu.bncollege.com/c/Creating-Capabiliti...
224332,PENN STATE UNIVERSITY PARK,SPRING,ENGL,431,001,BLUEST EYE,(REV)07,14.95,9780307278449,PENG RAND,NEW PRINT,https://psu.bncollege.com/c/Bluest-Eye/p/MBS_8...
155265,MISSISSIPPI STATE UNIVERSITY,SPRING,FDM,3553,001,MOS 2013 STUDY GUIDE - EXAMINATION 77-420,13,19.99,9780735669208,PEARSON,NEW PRINT,https://msstate.bncollege.com/c/MOS-2013-Study...
388949,UNIVERSITY OF MARYLAND COLLEGE PARK,SPRING,ENCH,630,0101,ANALYSIS OF TRANSPORT PHENOMENA,2ND 12,146.2,9780199740284,OXF,NEW PRINT RENTAL,https://umcp.bncollege.com/c/Analysis-of-Trans...
56514,BOSTON UNIVERSITY,SPRING,LAW JD,934,A1,PRIVACY IN TECHNOLOGY,14,75.0,9780988552562,IAPP,NEW PRINT,https://bu.bncollege.com/c/Privacy-in-Technolo...
84510,INDIANA UNIVERSITY BLOOMINGTON,FALL,CMLT-C,611,42647,PARADISE LOST,(REV)05,12.35,9780872207332,HACKETT,NEW PRINT,https://iub.bncollege.com/c/Paradise-Lost/p/MB...


In [39]:
merge_df = merge_df.reset_index(drop=True)

In [40]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610343 entries, 0 to 610342
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   university  610343 non-null  object 
 1   term        610343 non-null  object 
 2   department  610343 non-null  object 
 3   course      610343 non-null  object 
 4   section     610343 non-null  object 
 5   title       610343 non-null  object 
 6   edition     610343 non-null  object 
 7   price       610343 non-null  float64
 8   isbn        610343 non-null  object 
 9   publisher   610343 non-null  object 
 10  book_type   610343 non-null  object 
 11  book_url    610343 non-null  object 
dtypes: float64(1), object(11)
memory usage: 55.9+ MB


#### Connect to engine

# Import Security 
from config.py import user

In [49]:
inspector = inspect(engine)
inspector.get_table_names()

['course_textbooks']

In [43]:
protocol = "postgresql"
user = "postgres"
password = "bootcamp"
host = "localhost"
port = "5432"
db = "library_db"
conn_string = f'{protocol}://{user}:{password}@{host}:{port}/{db}'
engine = create_engine(conn_string)

In [47]:
course_textbooks.to_sql(name='course_textbooks', con=engine, if_exists='append', index=False)

343