## Read Data

In [1]:
def process_data(df, year):
    
    df.rename(columns={'Name of institution': 'Institution Name'}, inplace=True)
    df.rename(columns={'Sector name': 'Sector Name'}, inplace=True)
    df.rename(columns={'Calendar system': 'Calendar System'}, inplace=True)
    
    df['Year'] = year[:4]
    df.rename(columns={year: 'Cost'}, inplace=True)
    
    df.drop(['UnitID', 'OPEID', 'List C: High percent change tuition and fee indicator', 'Percent change'], axis=1, inplace=True)
    
    return df

In [2]:
import pandas as pd 
import glob

xls_path = './data/*.xls'
xlsx_path = './data/*.xlsx'

xls_files = glob.glob(xls_path)
xlsx_files = glob.glob(xlsx_path)

dfs = []
years = {0: '2008-09 Tuition and fees', 
         1: '2009-10 Tuition and fees',
         2: '2010-11 Tuition and fees',
         3: '2011-12 Tuition and fees',
         4: '2012-13 Tuition and fees',
         5: '2013-14 Tuition and fees',
         6: '2014-15 Tuition and fees'}

for file in xls_files: 
    print(f'File: {file.split("/")[-1]}')
    df = pd.read_excel(file, sheet_name='TuitionChange')         
    
    for k, year in years.items():
        if year in df.columns:
            df = process_data(df, year)
            df.drop([years[k+2]], axis=1, inplace=True)
    
    print(f'Empty Counts: {df.isnull().sum()}\n')
    dfs.append(df)
    
for file in xlsx_files:
    print(f'File: {file.split("/")[-1]}')
    df1 = pd.read_excel(file, sheet_name='TuitionChange') 
    df2 = df1.copy()
    
    for k, year in years.items():
        if year in df1.columns:
            df1.drop([years[k+2]], axis=1, inplace=True)
            df1 = process_data(df1, year)
            df2.drop([year], axis=1, inplace=True)
            df2 = process_data(df2, years[k+2])
    
    print(f'Empty Counts 1: {df1.isnull().sum()}\n')      
    dfs.append(df1)
    print(f'Empty Counts 2: {df2.isnull().sum()}\n')
    dfs.append(df2)
        
data = pd.concat(dfs, axis=0, ignore_index=True)

data

File: CATClists2010.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                141
Year                  0
dtype: int64

File: CATClists2011.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                159
Year                  0
dtype: int64

File: CATClists2012.xls
Empty Counts: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                179
Year                  0
dtype: int64

File: CATClists2013.xlsx
Empty Counts 1: Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                120
Year                  0
dtype: int64

Empty Counts 2: Sector              0
Sector Name         0
Institution Name    0
State               0
Calendar Syst

Unnamed: 0,Sector,Sector Name,Institution Name,State,Calendar System,Cost,Year
0,1,"4-year, public",University of the District of Columbia,DC,Academic,3140.0,2008
1,1,"4-year, public",Escuela de Artes Plasticas de Puerto Rico,PR,Academic,2728.0,2008
2,1,"4-year, public",Lake Washington Technical College,WA,Academic,1892.0,2008
3,1,"4-year, public",University of Puerto Rico-Aguadilla,PR,Academic,1747.0,2008
4,1,"4-year, public",University of Puerto Rico-Ponce,PR,Academic,1747.0,2008
...,...,...,...,...,...,...,...
47617,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Erlanger,KY,Program,15680.0,2014
47618,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Evansville,IN,Program,15680.0,2014
47619,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Johnson City,TN,Program,15680.0,2014
47620,9,"Less than 2-year, private for-profit",Ross Medical Education Center-Owensboro,KY,Program,15680.0,2014


## Clean Data

In [3]:
data.isnull().sum()

Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                656
Year                  0
dtype: int64

In [4]:
null_data = data[data['Cost'].isnull()].groupby('Institution Name').size().sort_values(ascending=False)
print(null_data)

null_data.value_counts()

Institution Name
Miller-Motte Technical College             4
Webb Institute                             4
Midwest Technical Institute                3
Anthem College-Atlanta                     3
Chamberlain College of Nursing-Virginia    2
                                          ..
ITT Technical Institute-Southfield         1
ITT Technical Institute-Salem              1
ITT Technical Institute-Philadelphia       1
ITT Technical Institute-Overland Park      1
ITT Technical Institute–Culver City        1
Length: 511, dtype: int64


1    372
2    135
3      2
4      2
Name: count, dtype: int64

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

clean_data = data.copy()

string_cols = ['Institution Name']
encoder = LabelEncoder()

for col in string_cols:
    clean_data.loc[:, col + ' Encoded'] = encoder.fit_transform(clean_data.loc[:, col])
    
data_missing = clean_data[clean_data['Cost'].isnull()]
data_complete = clean_data.dropna()

X_train = data_complete[['Year', 'Institution Name Encoded']]
print(f'{X_train.isnull().sum()}\n')
y_train = data_complete['Cost']
print(f'{y_train.isnull().sum()}\n')

X_test = data_missing[['Year', 'Institution Name Encoded']]
print(f'{X_test.isnull().sum()}\n')

model = LinearRegression()
model.fit(X_train, y_train)

predicted_values = model.predict(X_test)

clean_data.loc[clean_data['Cost'].isnull(), 'Cost'] = predicted_values
clean_data['Predicted'] = 0
clean_data['Year'] = pd.to_datetime(clean_data['Year'])

Year                        0
Institution Name Encoded    0
dtype: int64

0

Year                        0
Institution Name Encoded    0
dtype: int64



In [6]:
data.isnull().sum()

Sector                0
Sector Name           0
Institution Name      0
State                 0
Calendar System       0
Cost                656
Year                  0
dtype: int64

## Prediction Analysis

In [7]:
from statsmodels.tsa.arima.model import ARIMA

for name in clean_data['Institution Name'].unique():
    institution_data = clean_data[clean_data['Institution Name'] == name]
    institution_data.set_index('Year', inplace=True)
    cost = institution_data['Cost']
    print(f'{name}: {cost}')

    model = ARIMA(cost, order=(1,1,1)).fit()
    forecast = model.forecast(steps=5)
    print(f'{name}: {forecast}\n')

University of the District of Columbia: Year
2008-01-01    3140.0
2009-01-01    5370.0
2010-01-01    7000.0
2011-01-01    5000.0
2013-01-01    5138.0
2012-01-01    5128.0
2014-01-01    5251.0
Name: Cost, dtype: float64
University of the District of Columbia: 7     5250.841716
8     5250.683445
9     5250.525185
10    5250.366937
11    5250.208702
Name: predicted_mean, dtype: float64

Escuela de Artes Plasticas de Puerto Rico: Year
2008-01-01    2728.0
2009-01-01    3033.0
2010-01-01    4779.0
2011-01-01    4779.0
2013-01-01    3248.0
2012-01-01    3248.0
2014-01-01    3248.0
Name: Cost, dtype: float64
Escuela de Artes Plasticas de Puerto Rico: 7     3226.277666
8     3238.873662
9     3231.569699
10    3235.805003
11    3233.349103
Name: predicted_mean, dtype: float64

Lake Washington Technical College: Year
2008-01-01    1892.0
Name: Cost, dtype: float64
Lake Washington Technical College: 1    1892.0
2    1892.0
3    1892.0
4    1892.0
5    1892.0
Name: predicted_mean, dtype: float64


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  np.inner(score_obs, score_obs) /
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates,

University of Puerto Rico-Aguadilla: 7     2083.240667
8     2108.143807
9     2117.797603
10    2121.539934
11    2122.990663
Name: predicted_mean, dtype: float64

University of Puerto Rico-Ponce: Year
2008-01-01    1747.0
2009-01-01    1815.0
2010-01-01    2683.0
2011-01-01    2751.0
2013-01-01    2019.0
2012-01-01    2819.0
2014-01-01    2019.0
Name: Cost, dtype: float64
University of Puerto Rico-Ponce: 7     2083.240667
8     2108.143807
9     2117.797603
10    2121.539934
11    2122.990663
Name: predicted_mean, dtype: float64

University of Puerto Rico-Cayey: Year
2008-01-01    1913.0
2009-01-01    3812.0
2011-01-01    4016.0
2013-01-01    2212.0
2012-01-01    2819.0
2014-01-01    2212.0
Name: Cost, dtype: float64


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, 

University of Puerto Rico-Cayey: 6     2212.309447
7     2212.000072
8     2212.309375
9     2212.000144
10    2212.309303
Name: predicted_mean, dtype: float64

Georgia State University: Year
2008-01-01    5844.0
2009-01-01    8298.0
2010-01-01    8698.0
2011-01-01    9410.0
2013-01-01    8368.0
2012-01-01    9664.0
2014-01-01    8618.0
Name: Cost, dtype: float64
Georgia State University: 7     8618.323546
8     8618.000000
9     8618.323546
10    8618.000001
11    8618.323546
Name: predicted_mean, dtype: float64

University of Arizona: Year
2008-01-01     5542.0
2009-01-01     6855.0
2010-01-01     8237.0
2011-01-01    10035.0
2013-01-01    10391.0
2012-01-01    10035.0
2014-01-01    10957.0
Name: Cost, dtype: float64
University of Arizona: 7     12004.539466
8     12240.137989
9     12293.125648
10    12305.042922
11    12307.723195
Name: predicted_mean, dtype: float64

University of Puerto Rico-Humacao: Year
2008-01-01    1940.0
2009-01-01    2008.0
2010-01-01    2683.0
2011-01-01  

  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)

University of Puerto Rico-Rio Piedras: Year
2008-01-01    1272.0
2009-01-01    1810.0
2010-01-01    1878.0
2011-01-01    2746.0
2013-01-01    2082.0
2012-01-01    2819.0
2014-01-01    2019.0
Name: Cost, dtype: float64
University of Puerto Rico-Rio Piedras: 7     2536.642756
8     2019.008317
9     2536.634439
10    2019.016634
11    2536.626123
Name: predicted_mean, dtype: float64

Savannah State University: Year
2008-01-01    3848.0
2009-01-01    4774.0
2010-01-01    5624.0
2011-01-01    6032.0
2013-01-01    5415.0
2012-01-01    5290.0
2014-01-01    5556.0
Name: Cost, dtype: float64
Savannah State University: 7     5514.545871
8     5510.420536
9     5510.010000
10    5509.969145
11    5509.965080
Name: predicted_mean, dtype: float64

University of South Florida-St. Petersburg Campus: Year
2008-01-01    3183.0
2009-01-01    3713.0
Name: Cost, dtype: float64


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [None]:
# coefficients = model.coef_

# coefficients_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})

# coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
# coefficients_df.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

# coefficients_df

In [None]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import numpy as np

# mse = mean_squared_error(y_test, predicted)
# rmse = np.sqrt(mse)
# mae = mean_absolute_error(y_test, predicted)
# r2 = r2_score(y_test, predicted)

# print("Mean Squared Error (MSE):", mse)
# print("Root Mean Squared Error (RMSE):", rmse)
# print("Mean Absolute Error (MAE):", mae)
# print("R-squared (R2) score:", r2)

## Add Data to DB

### Postgres

In [13]:
# Connect to psql client: docker run -it --rm --network docker_my_network postgres:16 psql -h postgres -U postgres
# Use database: \c project

In [21]:
import psycopg2

# Define your connection parameters
db_host = 'localhost'
db_port = '5432'
db_name = 'project'
db_user = 'postgres'
db_password = 'password'

# Establish a connection to the PostgreSQL database
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )
    print("Connected to the database")
    
    cursor = conn.cursor()
    
    cursor.execute("SELECT version();")
    
    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)
    
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e) 

Connected to the database
PostgreSQL database version: ('PostgreSQL 16.2 (Debian 16.2-1.pgdg120+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',)


In [22]:
create_table_query = '''
CREATE TABLE IF NOT EXISTS Tuition (
    institution VARCHAR(100) NOT NULL,
    year INTEGER NOT NULL,
    sector INTEGER NOT NULL,
    sector_name VARCHAR(100) NOT NULL,
    state VARCHAR(25) NOT NULL,
    high_cost BOOLEAN NOT NULL,
    low_cost BOOLEAN NOT NULL,
    cost INTEGER NOT NULL
);
'''

cursor.execute(create_table_query)

conn.commit()

In [23]:
query = """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public';  -- Assuming tables are in the public schema
"""

# Execute the SQL query
cursor.execute(query)

# Fetch the results
rows = cursor.fetchall()

# Print the table names
for row in rows:
    print(row[0])

tuition


In [24]:
from sqlalchemy import create_engine

DATABASE_URL = "postgresql://postgres:password@localhost:5432/project"

engine = create_engine(DATABASE_URL)

table_name = 'tuition' # or Tuition
clean_data.to_sql(table_name, engine, if_exists='replace', index=False)

engine.dispose()

In [25]:
query = "SELECT * FROM Tuition LIMIT 100;"

cursor.execute(query)

rows = cursor.fetchall()

for row in rows:
    print(row)

(1, '4-year, public', 'University of the District of Columbia', 'DC', 'Academic', 3140.0, datetime.datetime(2008, 1, 1, 0, 0), 7925, 0)
(1, '4-year, public', 'Escuela de Artes Plasticas de Puerto Rico', 'PR', 'Academic', 2728.0, datetime.datetime(2008, 1, 1, 0, 0), 2393, 0)
(1, '4-year, public', 'Lake Washington Technical College', 'WA', 'Academic', 1892.0, datetime.datetime(2008, 1, 1, 0, 0), 3952, 0)
(1, '4-year, public', 'University of Puerto Rico-Aguadilla', 'PR', 'Academic', 1747.0, datetime.datetime(2008, 1, 1, 0, 0), 7848, 0)
(1, '4-year, public', 'University of Puerto Rico-Ponce', 'PR', 'Academic', 1747.0, datetime.datetime(2008, 1, 1, 0, 0), 7855, 0)
(1, '4-year, public', 'University of Puerto Rico-Cayey', 'PR', 'Academic', 1913.0, datetime.datetime(2008, 1, 1, 0, 0), 7852, 0)
(1, '4-year, public', 'Georgia State University', 'GA', 'Academic', 5844.0, datetime.datetime(2008, 1, 1, 0, 0), 2833, 0)
(1, '4-year, public', 'University of Arizona', 'AZ', 'Academic', 5542.0, datetime

In [26]:
cursor.close()
conn.close()