## Read Data

In [38]:
def process_data(df, year):
    
    df.rename(columns={'Name of institution': 'Institution Name'}, inplace=True)
    df.rename(columns={'Sector name': 'Sector Name'}, inplace=True)
    df.rename(columns={'Calendar system': 'Calendar System'}, inplace=True)
    
    df['Year'] = year[:4]
    df.rename(columns={year: 'Cost'}, inplace=True)
    
    df.drop(['UnitID', 'OPEID', 'List C: High percent change tuition and fee indicator', 'Percent change'], axis=1, inplace=True)
    
    return df

In [39]:
import pandas as pd 
import glob

xls_path = './data/*.xls'
xlsx_path = './data/*.xlsx'

xls_files = glob.glob(xls_path)
xlsx_files = glob.glob(xlsx_path)

dfs = []
years = {0: '2008-09 Tuition and fees', 
         1: '2009-10 Tuition and fees',
         2: '2010-11 Tuition and fees',
         3: '2011-12 Tuition and fees',
         4: '2012-13 Tuition and fees',
         5: '2013-14 Tuition and fees',
         6: '2014-15 Tuition and fees'}

for file in xls_files: 
    print(f'File: {file.split("/")[-1]}')
    df = pd.read_excel(file, sheet_name='TuitionChange')         
    
    for k, year in years.items():
        if year in df.columns:
            df = process_data(df, year)
            df.drop([years[k+2]], axis=1, inplace=True)
    
    print(f'Empty Counts: {df.isnull().sum()}\n')
    dfs.append(df)
    
for file in xlsx_files:
    print(f'File: {file.split("/")[-1]}')
    df1 = pd.read_excel(file, sheet_name='TuitionChange') 
    df2 = df1.copy()
    
    for k, year in years.items():
        if year in df1.columns:
            df1.drop([years[k+2]], axis=1, inplace=True)
            df1 = process_data(df1, year)
            df2.drop([year], axis=1, inplace=True)
            df2 = process_data(df2, years[k+2])
    
    print(f'Empty Counts 1: {df1.isnull().sum()}\n')      
    dfs.append(df1)
    print(f'Empty Counts 2: {df2.isnull().sum()}\n')
    dfs.append(df2)
        
data = pd.concat(dfs, axis=0, ignore_index=True)

data

File: data\CATClists2010.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                141
Year                  0
dtype: int64

File: data\CATClists2011.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                159
Year                  0
dtype: int64

File: data\CATClists2012.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                179
Year                  0
dtype: int64

File: data\CATClists2013.xlsx
Empty Counts 1: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                120
Year                  0
dtype: int64

Empty Counts 2: Sector              0
Sector Name         0
Institution Name    0
State          

Unnamed: 0,Sector,Sector Name,Institution Name,State,Calendar System,Cost,Year
0,1,"4-year, public",University of the District of Columbia,DC,Academic,3140.0,2008
1,1,"4-year, public",Escuela de Artes Plasticas de Puerto Rico,PR,Academic,2728.0,2008
2,1,"4-year, public",Lake Washington Technical College,WA,Academic,1892.0,2008
3,1,"4-year, public",University of Puerto Rico-Aguadilla,PR,Academic,1747.0,2008
4,1,"4-year, public",University of Puerto Rico-Ponce,PR,Academic,1747.0,2008
...,...,...,...,...,...,...,...
47617,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Erlanger,KY,Program,15680.0,2014
47618,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Evansville,IN,Program,15680.0,2014
47619,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Johnson City,TN,Program,15680.0,2014
47620,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Owensboro,KY,Program,15680.0,2014


## Clean Data

In [45]:
data.isnull().sum()

Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                656
Year                  0
dtype: int64

In [46]:
null_data = data[data['Cost'].isnull()].groupby('Institution Name').size().sort_values(ascending=False)
print(null_data)

null_data.value_counts()

Institution Name
Miller-Motte Technical College                     4
Webb Institute                                     4
Anthem College-Atlanta                             3
Midwest Technical Institute                        3
Susquehanna County Career and Technology Center    2
                                                  ..
ITT Technical Institute–Akron                      1
ITT Technical Institute-West Palm Beach            1
ITT Technical Institute-West Chester               1
ITT Technical Institute-University Park            1
Yeshiva College of the Nations Capital             1
Length: 511, dtype: int64


1    372
2    135
4      2
3      2
Name: count, dtype: int64

In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


string_cols = ['Institution Name']
encoder = LabelEncoder()

for col in string_cols:
    data.loc[:, col + ' Encoded'] = encoder.fit_transform(data.loc[:, col])
    
data_missing = data[data['Cost'].isnull()]
data_complete = data.dropna()

X_train = data_complete[['Year', 'Institution Name Encoded']]
print(f'{X_train.isnull().sum()}\n')
y_train = data_complete['Cost']
print(f'{y_train.isnull().sum()}\n')

X_test = data_missing[['Year', 'Institution Name Encoded']]
print(f'{X_test.isnull().sum()}\n')

model = LinearRegression()
model.fit(X_train, y_train)

predicted_values = model.predict(X_test)

data.loc[data['Cost'].isnull(), 'Cost'] = predicted_values

Year                        0
Institution Name Encoded    0
dtype: int64

0

Year                        0
Institution Name Encoded    0
dtype: int64



In [56]:
data.isnull().sum()

Sector                      0
Sector Name                 0
Institution Name            0
State                       0
Calendar System             0
Cost                        0
Year                        0
Institution Name Encoded    0
dtype: int64

In [4]:
data_clean = data.dropna()

null_indexes = data_clean[data_clean.isnull().any(axis=1)].index

print(f'Rows with null values: {len(null_indexes)}\n')

Rows with null values: 0



In [5]:
from sklearn.preprocessing import LabelEncoder

string_cols = ['Sector Name', 'State', 'Institution Name']

encoder = LabelEncoder()

for col in string_cols:
    data_clean.loc[:, col + ' Encoded'] = encoder.fit_transform(data_clean.loc[:, col])

# data_encoded = pd.get_dummies(data_clean, columns=string_cols)
    
# data_encoded
data_encoded = data_clean.drop(columns=['Sector Name', 'State', 'Institution Name'])
data_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.loc[:, col + ' Encoded'] = encoder.fit_transform(data_clean.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.loc[:, col + ' Encoded'] = encoder.fit_transform(data_clean.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.loc[:, col + ' Encoded'] =

Unnamed: 0,Sector,Cost,High Cost,Low Cost,Year,Sector Name Encoded,State Encoded,Institution Name Encoded
0,1,15250.0,1,0,2010,5,43,3225
1,1,14936.0,1,0,2010,5,43,4584
2,1,14066.0,1,0,2010,5,54,4640
3,1,13672.0,1,0,2010,5,35,4422
4,1,13630.0,1,0,2010,5,23,3913
...,...,...,...,...,...,...,...,...
21068,9,6480.0,0,1,2014,6,44,2024
21069,9,6422.0,0,1,2014,6,44,78
21070,9,6279.0,0,1,2014,6,10,1517
21071,9,6139.0,0,1,2014,6,44,3482


## Prediction Analysis

In [32]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# https://realpython.com/linear-regression-in-python/

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# X = data_encoded.drop(columns=['Cost'])
# X = data_encoded[['Year', 'Name of institution_encoded']]
# y = data_encoded['Cost']

# need to split by taking first years as train and last year as test / this splitting is wrong
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Years: {data_encoded['Year'].unique()}')
print(f'data: {len(data_encoded)}')
train = data_encoded[data_encoded['Year'] != '2014']
test = data_encoded[data_encoded['Year'] == '2014']

# X_train = train.drop(columns=['Cost'])
X_train = train[['Year', 'State Encoded', 'Institution Name Encoded', 'Sector']]
y_train = train['Cost']
print(f'year: {X_train['Year'].unique()}, x_train: {len(X_train)}, y_train: {len(y_train)}')

# X_test = test.drop(columns=['Cost'])
X_test = test[['Year', 'State Encoded', 'Institution Name Encoded', 'Sector']]
y_test = test['Cost']
print(f'year: {X_test['Year'].unique()}, x_test: {len(X_test)}, y_test: {len(y_test)}')


model = LinearRegression()
model.fit(X_train, y_train)

predicted = pd.Series(model.predict(X_test), index=X_test.index)


Years: ['2010' '2011' '2012' '2013' '2014']
data: 21070
year: ['2010' '2011' '2012' '2013'], x_train: 16930, y_train: 16930
year: ['2014'], x_test: 4140, y_test: 4140


In [33]:
coefficients = model.coef_

coefficients_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})

coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
coefficients_df.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

coefficients_df

Unnamed: 0,Feature,Coefficient,Absolute Coefficient
3,Sector,-1251.481186,1251.481186
0,Year,447.265121,447.265121
1,State Encoded,14.5485,14.5485
2,Institution Name Encoded,-0.268117,0.268117


In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) score:", r2)

Mean Squared Error (MSE): 119784345.56479475
Root Mean Squared Error (RMSE): 10944.60349052421
Mean Absolute Error (MAE): 8794.29027514099
R-squared (R2) score: 0.04809673761414335


In [38]:
np.asarray(test)

array([[1, 17772.0, 1, ..., 5, 43, 4584],
       [1, 17656.0, 1, ..., 5, 52, 896],
       [1, 17502.0, 1, ..., 5, 43, 3225],
       ...,
       [9, 6279.0, 0, ..., 6, 10, 1517],
       [9, 6139.0, 0, ..., 6, 44, 3482],
       [9, 2550.0, 0, ..., 6, 10, 2046]], dtype=object)

In [36]:
from statsmodels.tsa.arima.model import ARIMA

model = ARIMA(train, order=(1,1,1))
fitted_model = model.fit()

forecast_values = fitted_model.forecast(steps=len(test))

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

## Add Data to DB

### Postgres

In [123]:
import psycopg2

# Define your connection parameters
db_host = 'localhost'
db_port = '5432'
db_name = 'tuition'
db_user = 'my_user'
db_password = 'password'

# Establish a connection to the PostgreSQL database
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )
    print("Connected to the database")
    
    cursor = conn.cursor()
    
    cursor.execute("SELECT version();")
    
    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)
    
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e) 

Connected to the database
PostgreSQL database version: ('PostgreSQL 16.2 (Debian 16.2-1.pgdg120+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',)


In [116]:
create_table_query = '''
CREATE TABLE IF NOT EXISTS Tuition (
    institution VARCHAR(100) NOT NULL,
    year INTEGER NOT NULL,
    sector INTEGER NOT NULL,
    sector_name VARCHAR(100) NOT NULL,
    state VARCHAR(25) NOT NULL,
    high_cost BOOLEAN NOT NULL,
    low_cost BOOLEAN NOT NULL,
    cost INTEGER NOT NULL
);
'''

cursor.execute(create_table_query)

conn.commit()

In [125]:
query = """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public';  -- Assuming tables are in the public schema
"""

# Execute the SQL query
cursor.execute(query)

# Fetch the results
rows = cursor.fetchall()

# Print the table names
for row in rows:
    print(row[0])

tuition
Tuition


In [118]:
from sqlalchemy import create_engine

DATABASE_URL = "postgresql://my_user:password@localhost:5432/tuition"

engine = create_engine(DATABASE_URL)

table_name = 'Tuition'
data_clean.to_sql(table_name, engine, if_exists='replace', index=False)

engine.dispose()

In [124]:
query = "SELECT * FROM Tuition LIMIT 100;"

cursor.execute(query)

rows = cursor.fetchall()

for row in rows:
    print(row)

In [122]:
cursor.close()
conn.close()

### Mysql

In [None]:
# https://dev.mysql.com/doc/connector-python/en/connector-python-example-connecting.html

import mysql.connector
from mysql.connector import errorcode

try:
  cnx = mysql.connector.connect(user='root',
                                password='password',
                                database='tuition')
except mysql.connector.Error as err:
  if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
    print("Something is wrong with your user name or password")
  elif err.errno == errorcode.ER_BAD_DB_ERROR:
    print("Database does not exist")
  else:
    print(err)
    
cursor = cnx.cursor()

print("DATABASES:")
cursor.execute("SHOW DATABASES")
for x in cursor:
    print(x)
    
print("\nTABLES:")    
cursor.execute("SHOW TABLES")
for x in cursor: 
    print(x)

DATABASES:
('information_schema',)
('mysql',)
('performance_schema',)
('sys',)
('tuition',)

TABLES:


In [None]:
# create_table = ("CREATE TABLE `Wiki_Edit` ("
#     " `RevisionID` INT PRIMARY KEY,"
#     " `ArticleName` VARCHAR(500),"
#     " `EditDate` DATE,"
#     " `UserName` VARCHAR(50))")

# try:
#     print(f'Creating Table "Wiki_Edit"')
#     cursor.execute(create_table)
# except mysql.connector.Error as err:
#     if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
#         print("already exists.")
#     else:
#         print(err.msg)
# else:
#     print("OK")