## Read Data

In [None]:
def process_data(df, year):
    """Normalise one tuition sheet in place and return it.

    The descriptive headers are re-cased, the ``year`` column becomes
    ``'Cost'``, a ``'Year'`` column holding the first four characters of
    ``year`` is appended, and the identifier / percent-change columns are
    removed.

    Args:
        df: DataFrame to normalise; it is modified in place.
        year: Header of the cost column to keep. Its first four characters
            are stored (as a string) in the new ``'Year'`` column.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: If any of the columns to drop is missing from ``df``.
    """
    header_map = {
        'Name of institution': 'Institution Name',
        'Sector name': 'Sector Name',
        'Calendar system': 'Calendar System',
        year: 'Cost',
    }
    unwanted = [
        'UnitID',
        'OPEID',
        'List C: High percent change tuition and fee indicator',
        'Percent change',
    ]

    # Headers absent from the frame are silently skipped by rename.
    df.rename(columns=header_map, inplace=True)
    df['Year'] = year[:4]
    df.drop(columns=unwanted, inplace=True)

    return df

In [None]:
import pandas as pd 
import glob

# Glob patterns for the two workbook formats kept under ./data.
xls_path = './data/*.xls'
xlsx_path = './data/*.xlsx'

xls_files = glob.glob(xls_path)
xlsx_files = glob.glob(xlsx_path)

dfs = []
# Cost-column headers keyed by position, so the header two entries later can
# be looked up as years[k + 2].
# NOTE(review): that lookup raises KeyError for the last two keys (5 and 6);
# confirm no workbook uses one of those headers as its earlier year.
years = dict(enumerate([
    '2008-09 Tuition and fees',
    '2009-10 Tuition and fees',
    '2010-11 Tuition and fees',
    '2011-12 Tuition and fees',
    '2012-13 Tuition and fees',
    '2013-14 Tuition and fees',
    '2014-15 Tuition and fees',
]))

# .xls workbooks: the matching year column becomes 'Cost' and the column two
# entries later is discarded, giving one frame per file.
for file in xls_files:
    print(f'File: {file.split("/")[-1]}')
    df = pd.read_excel(file, sheet_name='TuitionChange')

    for k, year in years.items():
        if year not in df.columns:
            continue
        df = process_data(df, year)
        df.drop(columns=[years[k + 2]], inplace=True)

    print(f'Empty Counts: {df.isnull().sum()}\n')
    dfs.append(df)

# .xlsx workbooks: both year columns are kept, each in its own frame.
for file in xlsx_files:
    print(f'File: {file.split("/")[-1]}')
    df1 = pd.read_excel(file, sheet_name='TuitionChange')
    df2 = df1.copy()

    for k, year in years.items():
        if year not in df1.columns:
            continue
        later_year = years[k + 2]
        # df1 keeps the earlier year ...
        df1.drop(columns=[later_year], inplace=True)
        df1 = process_data(df1, year)
        # ... and df2 keeps the later one.
        df2.drop(columns=[year], inplace=True)
        df2 = process_data(df2, later_year)

    print(f'Empty Counts 1: {df1.isnull().sum()}\n')
    dfs.append(df1)
    print(f'Empty Counts 2: {df2.isnull().sum()}\n')
    dfs.append(df2)

# Stack every per-year frame into one table with a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

data

## Clean Data

In [None]:
# Count the missing values in each column of the combined frame.
data.isnull().sum(axis=0)

In [None]:
# Rows whose Cost is missing, counted per institution, largest count first.
missing_cost_rows = data[data['Cost'].isnull()]
null_data = (
    missing_cost_rows
    .groupby('Institution Name')
    .size()
    .sort_values(ascending=False)
)
print(null_data)

# How many institutions share each missing-row count.
null_data.value_counts()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Impute missing Cost values on a copy so the combined frame stays untouched.
clean_data = data.copy()

string_cols = ['Institution Name']
encoder = LabelEncoder()

# Integer-encode each text column into a sibling '<column> Encoded' column.
for col in string_cols:
    clean_data.loc[:, col + ' Encoded'] = encoder.fit_transform(clean_data[col])

feature_cols = ['Year', 'Institution Name Encoded']
cost_missing = clean_data['Cost'].isnull()

# Rows to fill versus rows to learn from.
# NOTE(review): dropna() removes rows with a missing value in ANY column,
# not only Cost - confirm that is the intended training set.
data_missing = clean_data[cost_missing]
data_complete = clean_data.dropna()

X_train = data_complete[feature_cols]
print(f'{X_train.isnull().sum()}\n')
y_train = data_complete['Cost']
print(f'{y_train.isnull().sum()}\n')

X_test = data_missing[feature_cols]
print(f'{X_test.isnull().sum()}\n')

# fit() returns the estimator itself, so it can be chained.
model = LinearRegression().fit(X_train, y_train)
predicted_values = model.predict(X_test)

# Write the predictions into the rows whose Cost was missing.
clean_data.loc[cost_missing, 'Cost'] = predicted_values
# Every row, imputed or not, gets Predicted = 0 here.
clean_data['Predicted'] = 0
#clean_data['Year'] = pd.to_datetime(clean_data['Year']) # Not sure we want this as a datetime

In [None]:
# Re-check missing values after imputation. The predicted costs were written
# to `clean_data` (a copy of `data`), so inspecting `data` here would only
# repeat the pre-imputation counts.
clean_data.isnull().sum()

## Prediction Analysis

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit one ARIMA(1, 1, 1) model per institution and print a five-step forecast.
for name in clean_data['Institution Name'].unique():
    institution_data = clean_data[clean_data['Institution Name'] == name]
    # Rows keep the order in which the source frames were concatenated, which
    # is not guaranteed to be chronological. ARIMA treats the series as
    # ordered observations, so sort by Year before fitting. A stable sort
    # (mergesort) keeps the existing order of any rows sharing a year.
    # set_index without inplace avoids mutating a filtered slice.
    institution_data = institution_data.set_index('Year').sort_index(kind='mergesort')
    cost = institution_data['Cost']
    print(f'{name}: {cost}')

    model = ARIMA(cost, order=(1, 1, 1)).fit()
    forecast = model.forecast(steps=5)
    print(f'{name}: {forecast}\n')

In [None]:
# coefficients = model.coef_

# coefficients_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})

# coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
# coefficients_df.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

# coefficients_df

In [None]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import numpy as np

# mse = mean_squared_error(y_test, predicted)
# rmse = np.sqrt(mse)
# mae = mean_absolute_error(y_test, predicted)
# r2 = r2_score(y_test, predicted)

# print("Mean Squared Error (MSE):", mse)
# print("Root Mean Squared Error (RMSE):", rmse)
# print("Mean Absolute Error (MAE):", mae)
# print("R-squared (R2) score:", r2)

## Add Data to DB

### Postgres

In [None]:
# Connect to psql client: docker run -it --rm --network docker_my_network postgres:16 psql -h postgres -U postgres
# Use database: \c project

In [None]:
import psycopg2

# Connection settings for the PostgreSQL server.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment before sharing this notebook.
db_host = 'localhost'
db_port = '5432'
db_name = 'project'
db_user = 'postgres'
db_password = 'password'

# Collect the settings once so the connect call stays compact.
connection_kwargs = {
    'host': db_host,
    'port': db_port,
    'database': db_name,
    'user': db_user,
    'password': db_password,
}

# Open the connection and run a trivial query as a sanity check. Any psycopg2
# error is printed rather than raised.
try:
    conn = psycopg2.connect(**connection_kwargs)
    print("Connected to the database")

    cursor = conn.cursor()
    cursor.execute("SELECT version();")
    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e)

In [None]:
# create_table_query = '''
# CREATE TABLE IF NOT EXISTS tuition (
#     institution VARCHAR(100) NOT NULL,
#     year SMALLINT NOT NULL,
#     sector SMALLINT NOT NULL,
#     sector_name VARCHAR(100) NOT NULL,
#     state VARCHAR(5) NOT NULL,
#     high_cost BOOLEAN NOT NULL,
#     low_cost BOOLEAN NOT NULL,
#     cost INTEGER NOT NULL
# );
# '''

# cursor.execute(create_table_query)

# conn.commit()

In [None]:
from sqlalchemy import create_engine

DATABASE_URL = "postgresql://postgres:password@localhost:5432/project"

engine = create_engine(DATABASE_URL)

table_name = 'tuition'
# Dispose of the engine even when the write fails, so its pooled connections
# are not left open after an error.
try:
    # if_exists='replace' drops any existing table before writing the rows.
    clean_data.to_sql(table_name, engine, if_exists='replace', index=False)
finally:
    engine.dispose()

In [None]:
# Show the first three rows of the cleaned frame.
clean_data.head(3)

In [None]:
# Fetch up to 100 rows from the tuition table and print them.
query = "SELECT * FROM tuition LIMIT 100;"
cursor.execute(query)
rows = cursor.fetchall()

# One row per line; nothing is printed when the result set is empty.
if rows:
    print("\n".join(str(row) for row in rows))

In [None]:
# Close the cursor first, then the connection it was created from.
for resource in (cursor, conn):
    resource.close()