In [1]:
### Loading Libraries 
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns
import datetime as dt
pd.set_option('display.max_columns', 101)

### Preliminary Data Handling

In [2]:
### Loading in data from Excel file
xls = pd.ExcelFile('Scholastic_AssociateDataScience_CaseStudy_Data.xlsx')
### Loading the seperate sheets of the excel file
mag_df = pd.read_excel(xls, 1)
buil_df = pd.read_excel(xls, 2)
club_df = pd.read_excel(xls, 3)
ed_df = pd.read_excel(xls, 4)

In [3]:
### Checking dimensions of sheets
print(mag_df.shape)
print(buil_df.shape)
print(club_df.shape)
print(ed_df.shape)

(101136, 5)
(51793, 20)
(140349, 3)
(94470, 3)


#### All NA's are from the building dataframe.  Majority of these NA's come from the reading performance and the b_pct columns

In [4]:
### Checking NA's
print(mag_df.isnull().sum())
print(buil_df.isnull().sum())
print(club_df.isnull().sum())
print(ed_df.isnull().sum())

school_year      0
building_id      0
teacher_count    0
paid_quantity    0
order_amount     0
dtype: int64
building_id                         0
mailing_state                       1
enrollment                          0
b_indicator_charter                 0
b_indicator_catholic                0
b_indicator_private                 0
b_indicator_public                  0
reading_performance_grd_3_pct    4672
reading_performance_grd_4_pct    4672
reading_performance_grd_5_pct    4672
reading_performance_grd_6_pct    4672
reading_performance_grd_7_pct    4672
reading_performance_grd_8_pct    4672
b_pct_aa                         4672
b_pct_as                         4672
b_pct_wa                         4672
b_pct_ha                         4672
b_pct_t1                         4672
b_census_hhi                      577
b_census_urbanicity               523
dtype: int64
building_id     0
school_year     0
bookclub_rev    0
dtype: int64
building_id         0
school_year         0
revenue_

In [5]:
### Joining magazine data sheets to sheet with building information
df = mag_df.merge(buil_df,on='building_id', how = 'left')

In [6]:
### Joining magazine and building data with other revenue data by building id and school_year
df = df.merge(club_df, on = ['building_id', 'school_year'], how = 'left').merge(ed_df, on = ['building_id','school_year'], how = 'left')

In [7]:
### Checking dimensions to ensure proper joins of final dataset
print(df.shape)

(101136, 26)


### Exploratory Data Analysis

In [8]:
### Checking NA's, majority of NA's in revenue_ed_group around ~1/3 data missing
print(df.isnull().sum())

school_year                          0
building_id                          0
teacher_count                        0
paid_quantity                        0
order_amount                         0
mailing_state                      818
enrollment                         818
b_indicator_charter                818
b_indicator_catholic               818
b_indicator_private                818
b_indicator_public                 818
reading_performance_grd_3_pct     3069
reading_performance_grd_4_pct     3069
reading_performance_grd_5_pct     3069
reading_performance_grd_6_pct     3069
reading_performance_grd_7_pct     3069
reading_performance_grd_8_pct     3069
b_pct_aa                          3069
b_pct_as                          3069
b_pct_wa                          3069
b_pct_ha                          3069
b_pct_t1                          3069
b_census_hhi                      1643
b_census_urbanicity               1573
bookclub_rev                      2144
revenue_ed_group         

#### Exploring the data to find characteristics of buildings that correspond to the most paid quantity decrease (Loss).

In [9]:
### Creating change in paid quantity variable
df['chg_pd_qty'] = df.groupby(['building_id']).paid_quantity.diff().fillna(0)

In [10]:
### Checking new variable
for key, item in df.groupby(['building_id']):
    print(df[['building_id','school_year','chg_pd_qty','paid_quantity']].groupby(['building_id']).get_group(key), "\n\n")

       building_id  school_year  chg_pd_qty  paid_quantity
0        600030153         2017         0.0             61
34157    600030153         2018        40.0            101
67932    600030153         2019       -45.0             56 


       building_id  school_year  chg_pd_qty  paid_quantity
1        600030160         2017         0.0            145
34158    600030160         2018        79.0            224
67933    600030160         2019        80.0            304 


       building_id  school_year  chg_pd_qty  paid_quantity
67934    600030173         2019         0.0             14 


       building_id  school_year  chg_pd_qty  paid_quantity
2        600030189         2017         0.0             68
34159    600030189         2018       -34.0             34
67935    600030189         2019        -2.0             32 


       building_id  school_year  chg_pd_qty  paid_quantity
3        600030227         2017         0.0             88
34160    600030227         2018        17.0 

       building_id  school_year  chg_pd_qty  paid_quantity
35       600030412         2017         0.0            100
34191    600030412         2018       -10.0             90
67968    600030412         2019        20.0            110 


       building_id  school_year  chg_pd_qty  paid_quantity
36       600030413         2017         0.0            126
34192    600030413         2018       -51.0             75
67969    600030413         2019        20.0             95 


       building_id  school_year  chg_pd_qty  paid_quantity
37       600030416         2017         0.0            472
34193    600030416         2018       -26.0            446 


       building_id  school_year  chg_pd_qty  paid_quantity
38       600030418         2017         0.0             40
34194    600030418         2018         2.0             42 


       building_id  school_year  chg_pd_qty  paid_quantity
39       600030419         2017         0.0             92
34195    600030419         2018        -3.0 

       building_id  school_year  chg_pd_qty  paid_quantity
71       600030601         2017         0.0            169
34223    600030601         2018       -49.0            120
67999    600030601         2019         0.0            120 


       building_id  school_year  chg_pd_qty  paid_quantity
72       600030603         2017         0.0            121
34224    600030603         2018        19.0            140
68000    600030603         2019        10.0            150 


       building_id  school_year  chg_pd_qty  paid_quantity
73       600030605         2017         0.0             53
34225    600030605         2018       361.0            414
68001    600030605         2019       -14.0            400 


       building_id  school_year  chg_pd_qty  paid_quantity
74       600030607         2017         0.0            560
34226    600030607         2018       -91.0            469
68002    600030607         2019       -84.0            385 


       building_id  school_year  chg_pd_qty 

       building_id  school_year  chg_pd_qty  paid_quantity
107      600030737         2017         0.0            250
34257    600030737         2018      -180.0             70
68031    600030737         2019        51.0            121 


       building_id  school_year  chg_pd_qty  paid_quantity
108      600030738         2017         0.0             85
34258    600030738         2018       -75.0             10 


       building_id  school_year  chg_pd_qty  paid_quantity
109      600030740         2017         0.0            323
34259    600030740         2018        72.0            395
68032    600030740         2019        68.0            463 


     building_id  school_year  chg_pd_qty  paid_quantity
110    600030745         2017         0.0            690 


       building_id  school_year  chg_pd_qty  paid_quantity
111      600030749         2017         0.0            551
34260    600030749         2018      -206.0            345
68033    600030749         2019        20.0     

       building_id  school_year  chg_pd_qty  paid_quantity
144      600030852         2017         0.0             40
34289    600030852         2018        20.0             60
68062    600030852         2019        41.0            101 


       building_id  school_year  chg_pd_qty  paid_quantity
145      600030853         2017         0.0            165
34290    600030853         2018       -21.0            144
68063    600030853         2019        68.0            212 


       building_id  school_year  chg_pd_qty  paid_quantity
34291    600030862         2018         0.0             16
68064    600030862         2019         4.0             20 


       building_id  school_year  chg_pd_qty  paid_quantity
146      600030873         2017         0.0            122
34292    600030873         2018       -48.0             74
68065    600030873         2019       -37.0             37 


       building_id  school_year  chg_pd_qty  paid_quantity
147      600030876         2017         0.0 

       building_id  school_year  chg_pd_qty  paid_quantity
175      600031000         2017         0.0            688
34320    600031000         2018       -24.0            664
68102    600031000         2019      -136.0            528 


       building_id  school_year  chg_pd_qty  paid_quantity
176      600031001         2017         0.0            900
34321    600031001         2018        10.0            910
68103    600031001         2019        10.0            920 


     building_id  school_year  chg_pd_qty  paid_quantity
177    600031003         2017         0.0             20 


       building_id  school_year  chg_pd_qty  paid_quantity
34322    600031019         2018         0.0             21
68104    600031019         2019       374.0            395 


       building_id  school_year  chg_pd_qty  paid_quantity
34323    600031022         2018         0.0             35 


     building_id  school_year  chg_pd_qty  paid_quantity
178    600031025         2017         0.0      

       building_id  school_year  chg_pd_qty  paid_quantity
208      600031151         2017         0.0            175
34353    600031151         2018       -25.0            150
68136    600031151         2019        30.0            180 


       building_id  school_year  chg_pd_qty  paid_quantity
209      600031156         2017         0.0            100
34354    600031156         2018        56.0            156 


       building_id  school_year  chg_pd_qty  paid_quantity
68137    600031159         2019         0.0             40 


       building_id  school_year  chg_pd_qty  paid_quantity
210      600031164         2017         0.0             25
34355    600031164         2018         0.0             25 


       building_id  school_year  chg_pd_qty  paid_quantity
211      600031166         2017         0.0             74
34356    600031166         2018        86.0            160
68138    600031166         2019         0.0            160 


       building_id  school_year  chg_pd_q

     building_id  school_year  chg_pd_qty  paid_quantity
238    600031300         2017         0.0            110 


     building_id  school_year  chg_pd_qty  paid_quantity
239    600031302         2017         0.0             76 


       building_id  school_year  chg_pd_qty  paid_quantity
34392    600031303         2018         0.0             72 


       building_id  school_year  chg_pd_qty  paid_quantity
240      600031304         2017         0.0            573
34393    600031304         2018      -178.0            395
68167    600031304         2019      -135.0            260 


       building_id  school_year  chg_pd_qty  paid_quantity
241      600031305         2017         0.0            104
34394    600031305         2018       -80.0             24 


       building_id  school_year  chg_pd_qty  paid_quantity
242      600031306         2017         0.0            195
34395    600031306         2018       -20.0            175
68168    600031306         2019        60.0      

       building_id  school_year  chg_pd_qty  paid_quantity
34428    600031457         2018         0.0             13 


       building_id  school_year  chg_pd_qty  paid_quantity
270      600031458         2017         0.0            324
34429    600031458         2018       -27.0            297
68200    600031458         2019        -8.0            289 


       building_id  school_year  chg_pd_qty  paid_quantity
271      600031460         2017         0.0            110
34430    600031460         2018        50.0            160
68201    600031460         2019       -80.0             80 


       building_id  school_year  chg_pd_qty  paid_quantity
272      600031461         2017         0.0            266
34431    600031461         2018      -126.0            140
68202    600031461         2019        -2.0            138 


       building_id  school_year  chg_pd_qty  paid_quantity
273      600031462         2017         0.0            786
34432    600031462         2018      -109.0 

       building_id  school_year  chg_pd_qty  paid_quantity
305      600031614         2017         0.0            198
34462    600031614         2018       -89.0            109
68229    600031614         2019        30.0            139 


       building_id  school_year  chg_pd_qty  paid_quantity
306      600031616         2017         0.0            320
34463    600031616         2018        -9.0            311
68230    600031616         2019       -72.0            239 


       building_id  school_year  chg_pd_qty  paid_quantity
307      600031617         2017         0.0            830
34464    600031617         2018       -25.0            805
68231    600031617         2019      -115.0            690 


       building_id  school_year  chg_pd_qty  paid_quantity
308      600031623         2017         0.0            415
34465    600031623         2018       -46.0            369
68232    600031623         2019        38.0            407 


       building_id  school_year  chg_pd_qty 

       building_id  school_year  chg_pd_qty  paid_quantity
336      600031937         2017         0.0            542
34499    600031937         2018       -21.0            521
68263    600031937         2019      -439.0             82 


       building_id  school_year  chg_pd_qty  paid_quantity
337      600031940         2017         0.0            610
34500    600031940         2018       -59.0            551
68264    600031940         2019         7.0            558 


       building_id  school_year  chg_pd_qty  paid_quantity
34501    600031941         2018         0.0            134
68265    600031941         2019       114.0            248 


       building_id  school_year  chg_pd_qty  paid_quantity
338      600031942         2017         0.0             45
34502    600031942         2018       -25.0             20
68266    600031942         2019         0.0             20 


       building_id  school_year  chg_pd_qty  paid_quantity
68267    600031943         2019         0.0 

       building_id  school_year  chg_pd_qty  paid_quantity
374      600032094         2017         0.0              2
34530    600032094         2018        28.0             30
68299    600032094         2019         5.0             35 


       building_id  school_year  chg_pd_qty  paid_quantity
375      600032095         2017         0.0             74
34531    600032095         2018       -49.0             25
68300    600032095         2019        45.0             70 


       building_id  school_year  chg_pd_qty  paid_quantity
376      600032097         2017         0.0             40
34532    600032097         2018        24.0             64 


       building_id  school_year  chg_pd_qty  paid_quantity
377      600032099         2017         0.0            450
34533    600032099         2018      -150.0            300
68301    600032099         2019      -124.0            176 


       building_id  school_year  chg_pd_qty  paid_quantity
378      600032104         2017         0.0 

       building_id  school_year  chg_pd_qty  paid_quantity
413      600032179         2017         0.0            239
34563    600032179         2018       -29.0            210
68326    600032179         2019         6.0            216 


       building_id  school_year  chg_pd_qty  paid_quantity
68327    600032180         2019         0.0             10 


     building_id  school_year  chg_pd_qty  paid_quantity
414    600032181         2017         0.0             40 


       building_id  school_year  chg_pd_qty  paid_quantity
415      600032185         2017         0.0             48
34564    600032185         2018       -13.0             35 


       building_id  school_year  chg_pd_qty  paid_quantity
416      600032186         2017         0.0            257
34565    600032186         2018      -178.0             79
68328    600032186         2019       -31.0             48 


       building_id  school_year  chg_pd_qty  paid_quantity
417      600032187         2017         0.0  

       building_id  school_year  chg_pd_qty  paid_quantity
449      600032349         2017         0.0             52
34596    600032349         2018       159.0            211
68356    600032349         2019        38.0            249 


       building_id  school_year  chg_pd_qty  paid_quantity
450      600032352         2017         0.0            188
34597    600032352         2018       -42.0            146 


       building_id  school_year  chg_pd_qty  paid_quantity
451      600032353         2017         0.0            480
34598    600032353         2018       412.0            892
68357    600032353         2019       -87.0            805 


     building_id  school_year  chg_pd_qty  paid_quantity
452    600032355         2017         0.0             69 


       building_id  school_year  chg_pd_qty  paid_quantity
453      600032357         2017         0.0            340
68358    600032357         2019        95.0            435 


       building_id  school_year  chg_pd_qty  

KeyboardInterrupt: 

In [11]:
#Creating an indicator variable with 1 if order quantity decreased or 0 if not
df['indicator_qty_dcr'] = np.where(df['chg_pd_qty']< 0, 1, 0)
df.head()

Unnamed: 0,school_year,building_id,teacher_count,paid_quantity,order_amount,mailing_state,enrollment,b_indicator_charter,b_indicator_catholic,b_indicator_private,b_indicator_public,reading_performance_grd_3_pct,reading_performance_grd_4_pct,reading_performance_grd_5_pct,reading_performance_grd_6_pct,reading_performance_grd_7_pct,reading_performance_grd_8_pct,b_pct_aa,b_pct_as,b_pct_wa,b_pct_ha,b_pct_t1,b_census_hhi,b_census_urbanicity,bookclub_rev,revenue_ed_group,chg_pd_qty,indicator_qty_dcr
0,2017,600030153,1,61,339.95,AL,293.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46433.0,RURAL,3468.5,,0.0,0
1,2017,600030160,5,145,903.25,AL,178.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35726.0,RURAL,3358.0,,0.0,0
2,2017,600030189,4,68,357.0,AL,630.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55560.0,SUBURBAN,3766.0,,0.0,0
3,2017,600030227,1,88,326.92,AL,471.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40817.0,SUBURBAN,1853.0,,0.0,0
4,2017,600030248,1,20,64.9,AL,110.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61652.0,RURAL,978.0,,0.0,0


In [13]:
#Checking for duplicates, No duplicates
df.duplicated().sum()

0

In [26]:
df['year_school'] = pd.to_datetime(df['school_year'], format = '%Y')

In [27]:
# listing data types
df.dtypes

school_year                               int64
building_id                               int64
teacher_count                             int64
paid_quantity                             int64
order_amount                            float64
mailing_state                            object
enrollment                              float64
b_indicator_charter                     float64
b_indicator_catholic                    float64
b_indicator_private                     float64
b_indicator_public                      float64
reading_performance_grd_3_pct           float64
reading_performance_grd_4_pct           float64
reading_performance_grd_5_pct           float64
reading_performance_grd_6_pct           float64
reading_performance_grd_7_pct           float64
reading_performance_grd_8_pct           float64
b_pct_aa                                float64
b_pct_as                                float64
b_pct_wa                                float64
b_pct_ha                                

In [28]:
# Checking value counts of relevant variables
print(df['school_year'].value_counts(), end = '\n\n')# Fairly uniform
print(df['year_school'].value_counts(), end = '\n\n')
print(df['b_census_urbanicity'].value_counts(), end = '\n\n')
print(df['indicator_qty_dcr'].value_counts(), end = '\n\n')

2017    34157
2018    33775
2019    33204
Name: school_year, dtype: int64

2017-01-01    34157
2018-01-01    33775
2019-01-01    33204
Name: year_school, dtype: int64

RURAL       52186
SUBURBAN    34843
URBAN       12534
Name: b_census_urbanicity, dtype: int64

0    73366
1    27770
Name: indicator_qty_dcr, dtype: int64



In [15]:
# Analysing building data with missing revenue_ed
missing_df = df[df['revenue_ed_group'].isna()]


Unnamed: 0,school_year,building_id,teacher_count,paid_quantity,order_amount,mailing_state,enrollment,b_indicator_charter,b_indicator_catholic,b_indicator_private,b_indicator_public,reading_performance_grd_3_pct,reading_performance_grd_4_pct,reading_performance_grd_5_pct,reading_performance_grd_6_pct,reading_performance_grd_7_pct,reading_performance_grd_8_pct,b_pct_aa,b_pct_as,b_pct_wa,b_pct_ha,b_pct_t1,b_census_hhi,b_census_urbanicity,bookclub_rev,revenue_ed_group,chg_pd_qty,indicator_qty_dcr
0,2017,600030153,1,61,339.95,AL,293.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46433.0,RURAL,3468.5,,0.0,0
1,2017,600030160,5,145,903.25,AL,178.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35726.0,RURAL,3358.0,,0.0,0
2,2017,600030189,4,68,357.0,AL,630.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55560.0,SUBURBAN,3766.0,,0.0,0
3,2017,600030227,1,88,326.92,AL,471.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40817.0,SUBURBAN,1853.0,,0.0,0
4,2017,600030248,1,20,64.9,AL,110.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61652.0,RURAL,978.0,,0.0,0


In [None]:
#Check which buildings dont have info for all three years
#Check if the revenue_ed variable is correlated with paid quantity or chg_pd_qty
#Check if 