In [1]:
# Import pandas and statsmodels
import pandas as pd
import statsmodels.api as sm
# Load the historical financial records from the CSV export into a DataFrame
records_csv = 'Expected_Revenue_-_Historical_Financial_Records.csv'
df = pd.read_csv(records_csv)

# Preview the first ten rows to confirm the file was read correctly
df.head(n=10)


Unnamed: 0,Record ID,Name,Facility Type,Average Patient Age,Number of Patients Served per Year,Average Patient Cost,Visitors Allowed After 8PM,Google Review Rating,Average Age of Equipment,Year Established,Emagine Revenue
0,1000,United Hospital Center,Hospital,73,288,1661,1,2.2,15.4,1961,12000
1,1001,T. J. Samson Community Hospital,Hospital,74,483,294,0,2.3,17.7,1965,10500
2,1002,Mount Auburn Hospital,Hospital,48,727,276,1,1.2,16.1,1968,7675
3,1003,Baptist Memorial Hospital of Union County,Hospital,87,269,953,1,2.0,14.9,1972,10157
4,1004,Broward Health Medical Center,Hospital,39,543,746,1,2.7,15.0,1966,9534
5,1005,Adventist Health White Memorial,Hospital,61,1041,290,1,3.6,14.4,1982,13762
6,1006,Maple Grove Hospital,Hospital,68,418,2133,0,1.5,7.9,1983,8327
7,1007,CHI St. Vincent Hot Springs,Hospital,72,1330,1390,0,2.4,12.2,1967,13158
8,1008,Bayfront Health St. Petersburg,Hospital,24,522,3141,1,2.9,8.9,1994,12702
9,1009,Manchester Memorial Hospital,Hospital,70,556,2809,1,4.3,7.7,1996,18607


In [10]:
# Check the variable types to ensure regression will work.
# As a reminder, you want all independent variable types to be integers or floats.
#
# NOTE: only the last expression of a cell is displayed, so a bare `df.dtypes`
# placed before it would show nothing and has been dropped.
# NOTE: `astype` returns a new DataFrame and the result is not assigned back, so
# this is only a preview of the dtypes *after* casting the two columns to int64.
# `df` itself is left unchanged, so the regression below still uses the original
# values of 'Google Review Rating' and 'Average Age of Equipment'.
int_preview_casts = {'Google Review Rating': 'int64', 'Average Age of Equipment': 'int64'}
df.astype(int_preview_casts).dtypes

Record ID                              int64
Name                                  object
Facility Type                         object
Average Patient Age                    int64
Number of Patients Served per Year     int64
Average Patient Cost                   int64
Visitors Allowed After 8PM             int64
Google Review Rating                   int64
Average Age of Equipment               int64
Year Established                       int64
Emagine Revenue                        int64
dtype: object

In [11]:
# Build X (predictors) and y (target), starting with every candidate independent variable.
# Record ID, Name, and Facility Type are deliberately excluded - they are not possible predictors.
predictor_columns = [
    'Average Patient Age',
    'Number of Patients Served per Year',
    'Average Patient Cost',
    'Visitors Allowed After 8PM',
    'Google Review Rating',
    'Average Age of Equipment',
    'Year Established',
]
X = df[predictor_columns]
y = df['Emagine Revenue']


In [12]:
# Prepend a constant column to X so the regression includes an intercept term
X = sm.add_constant(X)

# Fit an ordinary least squares model of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it as plain text
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:        Emagine Revenue   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.776
Method:                 Least Squares   F-statistic:                     304.6
Date:                Fri, 09 Dec 2022   Prob (F-statistic):          1.26e-193
Time:                        00:12:57   Log-Likelihood:                -5608.2
No. Observations:                 613   AIC:                         1.123e+04
Df Residuals:                     605   BIC:                         1.127e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

# Answer the given questions

#Q1: Machine learning requires all the variables to be numerical values before fitting and evaluating a model.
#Q2: I think Number of Patients Served per Year, Average Patient Cost, and Google Review Rating will be significant in influencing the revenue. The more patients served, the higher the average patient cost, and the more positive the Google review ratings, the higher the revenue should be, and vice versa.
#Q3: Average Patient Age, Number of Patients Served per Year, Average Patient Cost, and Google Review Rating are the significant variables, with p-values below 0.05. I think Visitors Allowed After 8PM should have been a significant variable, since allowing late visitors should mean more visitors and therefore more revenue.
#Q4: A one-unit increase in the number of patients served per year is associated with a 6.1931 increase in expected revenue, holding the other predictors constant, and vice versa. It makes sense, since total revenue is calculated on a yearly basis.
#Q5: R² = 0.779. It means that 77.9% of the variation in the expected revenue is explained by the predictor variables.