## Read Data

In [53]:
def process_data(df, year):
    
    df.rename(columns={'Name of institution': 'Institution Name'}, inplace=True)
    df.rename(columns={'Sector name': 'Sector Name'}, inplace=True)
    df.rename(columns={'Calendar system': 'Calendar System'}, inplace=True)
    
    df['Year'] = year[:4]
    df.rename(columns={year: 'Cost'}, inplace=True)
    
    df.drop(['OPEID', 'List C: High percent change tuition and fee indicator', 'Percent change'], axis=1, inplace=True)
    
    return df

In [54]:
import pandas as pd 
import glob

xls_path = './data/*.xls'
xlsx_path = './data/*.xlsx'

xls_files = glob.glob(xls_path)
xlsx_files = glob.glob(xlsx_path)

dfs = []
years = {0: '2008-09 Tuition and fees', 
         1: '2009-10 Tuition and fees',
         2: '2010-11 Tuition and fees',
         3: '2011-12 Tuition and fees',
         4: '2012-13 Tuition and fees',
         5: '2013-14 Tuition and fees',
         6: '2014-15 Tuition and fees'}

for file in xls_files: 
    print(f'File: {file.split("/")[-1]}')
    df = pd.read_excel(file, sheet_name='TuitionChange')         
    
    for k, year in years.items():
        if year in df.columns:
            df = process_data(df, year)
            df.drop([years[k+2]], axis=1, inplace=True)
    
    print(f'Empty Counts: {df.isnull().sum()}\n')
    dfs.append(df)
    
for file in xlsx_files:
    print(f'File: {file.split("/")[-1]}')
    df1 = pd.read_excel(file, sheet_name='TuitionChange') 
    df2 = df1.copy()
    
    for k, year in years.items():
        if year in df1.columns:
            df1.drop([years[k+2]], axis=1, inplace=True)
            df1 = process_data(df1, year)
            df2.drop([year], axis=1, inplace=True)
            df2 = process_data(df2, years[k+2])
    
    print(f'Empty Counts 1: {df1.isnull().sum()}\n')      
    dfs.append(df1)
    print(f'Empty Counts 2: {df2.isnull().sum()}\n')
    dfs.append(df2)
        
dfs[-2], dfs[-3] = dfs[-3], dfs[-2]

data = pd.concat(dfs, axis=0, ignore_index=True)


data

File: data\CATClists2010.xls
Empty Counts: Sector                0
Sector Name           0
UnitID                0
Institution Name      0
State                 0
Calendar System       0
Cost                141
Year                  0
dtype: int64

File: data\CATClists2011.xls
Empty Counts: Sector                0
Sector Name           0
UnitID                0
Institution Name      0
State                 0
Calendar System       0
Cost                159
Year                  0
dtype: int64

File: data\CATClists2012.xls
Empty Counts: Sector                0
Sector Name           0
UnitID                0
Institution Name      0
State                 0
Calendar System       0
Cost                179
Year                  0
dtype: int64

File: data\CATClists2013.xlsx
Empty Counts 1: Sector                0
Sector Name           0
UnitID                0
Institution Name      0
State                 0
Calendar System       0
Cost                120
Year                  0
dtype: int64

E

Unnamed: 0,Sector,Sector Name,UnitID,Institution Name,State,Calendar System,Cost,Year
0,1,"4-year, public",131399,University of the District of Columbia,DC,Academic,3140.0,2008
1,1,"4-year, public",241951,Escuela de Artes Plasticas de Puerto Rico,PR,Academic,2728.0,2008
2,1,"4-year, public",235699,Lake Washington Technical College,WA,Academic,1892.0,2008
3,1,"4-year, public",243106,University of Puerto Rico-Aguadilla,PR,Academic,1747.0,2008
4,1,"4-year, public",243212,University of Puerto Rico-Ponce,PR,Academic,1747.0,2008
...,...,...,...,...,...,...,...,...
47617,9,"Less than 2-year, private for-profit",481890,Ross Medical Education Center-Erlanger,KY,Program,15680.0,2014
47618,9,"Less than 2-year, private for-profit",484349,Ross Medical Education Center-Evansville,IN,Program,15680.0,2014
47619,9,"Less than 2-year, private for-profit",484358,Ross Medical Education Center-Johnson City,TN,Program,15680.0,2014
47620,9,"Less than 2-year, private for-profit",484330,Ross Medical Education Center-Owensboro,KY,Program,15680.0,2014


## Clean Data

In [55]:
data.isnull().sum()

Sector                0
Sector Name           0
UnitID                0
Institution Name      0
State                 0
Calendar System       0
Cost                656
Year                  0
dtype: int64

In [56]:
null_data = data[data['Cost'].isnull()].groupby('Institution Name').size().sort_values(ascending=False)
print(null_data)

null_data.value_counts()

Institution Name
Miller-Motte Technical College                     4
Webb Institute                                     4
Anthem College-Atlanta                             3
Midwest Technical Institute                        3
Susquehanna County Career and Technology Center    2
                                                  ..
ITT Technical Institute–Akron                      1
ITT Technical Institute-West Palm Beach            1
ITT Technical Institute-West Chester               1
ITT Technical Institute-University Park            1
Yeshiva College of the Nations Capital             1
Length: 511, dtype: int64


1    372
2    135
4      2
3      2
Name: count, dtype: int64

In [127]:
# fill in null values

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

clean_data = data.copy()

string_cols = ['Institution Name', 'State']
encoder = LabelEncoder()

for col in string_cols:
    clean_data.loc[:, col + ' Encoded'] = encoder.fit_transform(clean_data.loc[:, col])
    
data_missing = clean_data[clean_data['Cost'].isnull()]
data_complete = clean_data.dropna()

X_train = data_complete[['Year', 'UnitID', 'Institution Name Encoded', 'State Encoded']]
y_train = data_complete['Cost']

X_test = data_missing[['Year', 'UnitID', 'Institution Name Encoded', 'State Encoded']]

model = LinearRegression()
model.fit(X_train, y_train)

predicted_values = model.predict(X_test)

clean_data.loc[clean_data['Cost'].isnull(), 'Cost'] = predicted_values
clean_data['Predicted'] = 0
clean_data['Year'] = pd.to_datetime(clean_data['Year'])

print(clean_data['Year'].unique())

<DatetimeArray>
['2008-01-01 00:00:00', '2009-01-01 00:00:00', '2010-01-01 00:00:00',
 '2011-01-01 00:00:00', '2012-01-01 00:00:00', '2013-01-01 00:00:00',
 '2014-01-01 00:00:00']
Length: 7, dtype: datetime64[ns]


In [58]:
start_year = clean_data['Year'].dt.year.min()
end_year = clean_data['Year'].dt.year.max()
years_range = range(start_year, end_year + 1)

institutions_missing_years = {}
colleges_with_gaps = {}
count = 0

grouped = clean_data.groupby(['Institution Name', 'State', 'UnitID'])
print(f'Total Institutions : {len(grouped)}')
print('Institutions Yearly Data Count')
print(grouped['Year'].nunique().value_counts().sort_index())
print()

institutions_with_gaps = []

for (name, state, unitid), group in grouped:
    years_present = group['Year'].dt.year
    gaps = years_present.diff().fillna(1).ne(1)
    
    if gaps.any():
        institutions_with_gaps.append((name, state, unitid, list(years_present)))

count = 0
for institution in institutions_with_gaps:
    if len(institution[3]) > 3:
        print(f"Institution: {institution[0]}, State: {institution[1]}, ID: {institution[2]}")
        print(f"Existing Years: {institution[3]}")
        print()
        count += 1
        
print(f'Fillable Years : {count}')

Total Institutions : 8696
Institutions Yearly Data Count
Year
1     451
2     889
3     570
4     756
5     610
6     331
7    5089
Name: count, dtype: int64

Institution: Academy of Aesthetic Arts, State: KS, ID: 461616
Existing Years: [2009, 2010, 2011, 2013]

Institution: Academy of Massage Therapy, State: NJ, ID: 437501
Existing Years: [2008, 2009, 2010, 2011, 2013]

Institution: Acadiana Technical College-Lafayette Campus, State: LA, ID: 159443
Existing Years: [2008, 2009, 2010, 2011, 2013]

Institution: Acupuncture and Massage College, State: FL, ID: 439969
Existing Years: [2008, 2009, 2011, 2013]

Institution: Allied Health Careers, State: TX, ID: 367316
Existing Years: [2008, 2009, 2010, 2011, 2013]

Institution: Altamaha Technical College, State: GA, ID: 366447
Existing Years: [2008, 2009, 2010, 2011, 2013]

Institution: American Advanced Technicians Institute, State: FL, ID: 444370
Existing Years: [2008, 2011, 2012, 2013, 2014]

Institution: American Auto Institute, State: CA

### **Do the next section if worth creating 387 rows from step above...**

In [None]:
# names = clean_data['Institution Name'].unique()

# rows = []

# earliest = clean_data['Year'].unique().min()
# latest = clean_data['Year'].unique().max()
# years = clean_data['Year'].unique()

# for name in names:
#     name_data = clean_data.loc[clean_data['Institution Name'] == name]
    
#     if len(name_data) < 5:
#         continue
    
#     for year in years:
#         if year not in name_data['Year'].values:
#             if year == earliest or year == latest:
#                 continue
            
#             print(name_data)
            
#             prev_row = name_data.loc[name_data['Year'] == year - pd.DateOffset(years=1)]
#             next_row = name_data.loc[name_data['Year'] == year + pd.DateOffset(years=1)]
            
#             if prev_row.empty:
#                 prev_row = name_data.loc[name_data['Year'] == year - pd.DateOffset(years=2)]
        
#             if next_row.empty:
#                 next_row = name_data.loc[name_data['Year'] == year + pd.DateOffset(years=2)]
                                                                                   
#             prev_row = prev_row.iloc[0]
#             next_row = next_row.iloc[0]
#             cost = (prev_row ['Cost'] + next_row['Cost']) / 2 
#             row = prev_row
#             row['Year'] = year
#             row['Cost'] = cost
#             row['Predicted'] = 1
#             rows.append(row)

In [None]:
# final_data = pd.concat([clean_data] + rows, ignore_index=True)

In [None]:
# final_data.isnull().sum()

### Check Linearity

In [84]:
data_df = data.copy()
data_df['Prediction'] = 0
data_df.dropna(inplace=True)


In [85]:
data_df.isnull().sum()

Sector              0
Sector Name         0
UnitID              0
Institution Name    0
State               0
Calendar System     0
Cost                0
Year                0
Prediction          0
dtype: int64

In [86]:
def check_linearity(df):
    results = {}
    for (name, id), group in df.groupby(['Institution Name', 'UnitID']):
        X = group['Year'].values.reshape(-1, 1)
        y = group['Cost'].values
        
        if len(y) < 4:
            continue

        model = LinearRegression()
        model.fit(X, y)

        # Check linearity by comparing R-squared
        results[id] = model.score(X, y)
        
    return results

linear = 0
linear_ids = []
non_linear = 0
linearity = check_linearity(data_df)
for k, v in linearity.items():
    if v < 0.7:
        non_linear+=1
    else:
        linear+=1
        linear_ids.append(k)
        
print(f'Linear : {linear}, Non Linear : {non_linear}')

Linear : 4839, Non Linear : 1839


In [88]:
linear_df = data_df[data_df['UnitID'].isin(linear_ids)]
linear_df

Unnamed: 0,Sector,Sector Name,UnitID,Institution Name,State,Calendar System,Cost,Year,Prediction
2,1,"4-year, public",235699,Lake Washington Technical College,WA,Academic,1892.0,2008,0
7,1,"4-year, public",104179,University of Arizona,AZ,Academic,5542.0,2008,0
12,1,"4-year, public",448840,University of South Florida-St. Petersburg Campus,FL,Academic,3183.0,2008,0
13,1,"4-year, public",139959,University of Georgia,GA,Academic,6030.0,2008,0
14,1,"4-year, public",228705,Texas A & M University-Kingsville,TX,Academic,4386.0,2008,0
...,...,...,...,...,...,...,...,...,...
47551,9,"Less than 2-year, private for-profit",467544,Regency Beauty Institute-Jacksonville Regency,FL,Program,13550.0,2014,0
47552,9,"Less than 2-year, private for-profit",475547,Paul Mitchell the School-Fort Myers,FL,Program,13500.0,2014,0
47554,9,"Less than 2-year, private for-profit",462336,International College of Beauty Arts & Sciences,CA,Program,16100.0,2014,0
47555,9,"Less than 2-year, private for-profit",455008,Tricoci University of Beauty Culture-Rockford,IL,Program,16900.0,2014,0


In [113]:
from sklearn.metrics import mean_squared_error
import numpy as np

mses = {}
future_rows = []
for (name, id), group in linear_df.groupby(['Institution Name', 'UnitID']):  
    # Test model
    X = group['Year'].values.reshape(-1, 1) 
    y = group['Cost'].values
    
    if len(y) < 4:
        continue
    
    X_train = X[:-1]
    y_train = y[:-1]
    X_test = X[-1].reshape(1, -1)
    y_test = y[-1].reshape(1, -1)
    model = LinearRegression()
    model.fit(X_train, y_train) 
    
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # print(f'Pred : {y_pred}, Actual : {y_test}')
    mses[id] = mse
    
    # future predictions
    year = int(X_test[0][0]) + 1
    future_year = np.array([[year]])
    future_pred = model.predict(future_year)
    row = group.iloc[0].copy()
    row['Year'] = str(year)
    row['Cost'] = future_pred[0]
    row['Prediction'] = 1
    future_rows.append(row)

In [94]:
has_nan = any(series.isna().any() for series in future_rows)

if has_nan:
    print("At least one Series contains NaN values.")
else:
    print("No Series contain NaN values.")

No Series contain NaN values.


In [96]:
average = sum(mses.values()) / len(mses)

print(f'Average MSE : {average}')

Average MSE : 1019590.6451606423


In [97]:
data_df.isnull().sum()

Sector              0
Sector Name         0
UnitID              0
Institution Name    0
State               0
Calendar System     0
Cost                0
Year                0
Prediction          0
dtype: int64

In [122]:
pred_df = pd.DataFrame(future_rows)
pred_df

Unnamed: 0,Sector,Sector Name,UnitID,Institution Name,State,Calendar System,Cost,Year,Prediction
5302,9,"Less than 2-year, private for-profit",457590,A & W Healthcare Educators,LA,Program,24371.428571,2015,1
5338,9,"Less than 2-year, private for-profit",106281,ABC Beauty College Inc,AR,Program,12999.257143,2015,1
1384,2,"4-year, private not-for-profit",152822,AIB College of Business,IA,Academic,15753.285714,2015,1
17998,6,"2-year, private for-profit",404994,ASA College,NY,Academic,12320.200000,2015,1
19675,9,"Less than 2-year, private for-profit",475431,ASI Career Institute,NJ,Program,11500.000000,2015,1
...,...,...,...,...,...,...,...,...,...
6034,9,"Less than 2-year, private for-profit",413820,Yukon Beauty College Inc,OK,Program,12131.428571,2015,1
5206,9,"Less than 2-year, private for-profit",451237,Z Hair Academy,KS,Program,16112.085714,2015,1
3377,4,"2-year, public",204255,Zane State College,OH,Academic,4885.466667,2015,1
3740,5,"2-year, private not-for-profit",184357,duCret School of Arts,NJ,Academic,9574.619048,2015,1


In [123]:
pred_df.isnull().sum()

Sector              0
Sector Name         0
UnitID              0
Institution Name    0
State               0
Calendar System     0
Cost                0
Year                0
Prediction          0
dtype: int64

In [124]:
pred_df.dtypes

Sector                int64
Sector Name          object
UnitID                int64
Institution Name     object
State                object
Calendar System      object
Cost                float64
Year                 object
Prediction            int64
dtype: object

In [116]:
data_df.dtypes

Sector                int64
Sector Name          object
UnitID                int64
Institution Name     object
State                object
Calendar System      object
Cost                float64
Year                 object
Prediction            int64
dtype: object

In [125]:
test_df = pd.concat([data_df + pred_df], ignore_index=True)
test_df

Unnamed: 0,Sector,Sector Name,UnitID,Institution Name,State,Calendar System,Cost,Year,Prediction
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
46961,,,,,,,,,
46962,,,,,,,,,
46963,,,,,,,,,
46964,,,,,,,,,


In [103]:
test_df.isnull().sum()

Sector                  9
Sector Name             9
UnitID                  9
Institution Name        9
State                   9
                    ...  
6034                46966
5206                46966
3377                46966
3740                46966
26243               46966
Length: 4848, dtype: int64

## Prediction Analysis

In [None]:
# from statsmodels.tsa.arima.model import ARIMA

# pred_rows = []

# for (name, state, unitid), institution_data in grouped:
#     df = institution_data.copy()
#     pred = df.iloc[-1].copy()
    
#     if df['Year'].dt.year.diff().fillna(1).ne(1).any() or len(df) < 4:
#         continue
    
#     df.set_index('Year', inplace=True)
#     df.index = pd.DatetimeIndex(df.index, freq='infer')

#     forecast = ARIMA(df['Cost'], order=(1,0,0)).fit().forecast(steps=3)
    
#     for idx, cost in enumerate(forecast):
#         pred_row = pred.copy()
#         pred_row['Predicted'] = 1 
#         pred_row['Cost'] = cost
#         pred_row['Year'] = pred_row['Year'] + pd.DateOffset(years=idx+1)
#         pred_rows.append(pred_row)

# print(len(pred_rows))        

In [None]:
# clean_data = pd.concat([clean_data] + pred_rows, ignore_index=True)

## Add Data to DB

### Postgres

In [128]:
# Connect to psql client: docker run -it --rm --network docker_my_network postgres:16 psql -h postgres -U postgres
# Use database: \c project

In [None]:
# Change year to an integer
clean_data['Year'] = clean_data['Year'].dt.year

In [139]:
import psycopg2

# Define your connection parameters
db_host = 'localhost'
db_port = '5432'
db_name = 'project'
db_user = 'postgres'
db_password = 'password'

# Establish a connection to the PostgreSQL database
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )
    print("Connected to the database")
    
    cursor = conn.cursor()
    
    cursor.execute("SELECT version();")
    
    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)
    
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e) 

Connected to the database
PostgreSQL database version: ('PostgreSQL 16.2 (Debian 16.2-1.pgdg120+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',)


In [130]:
# create_table_query = '''
# CREATE TABLE IF NOT EXISTS tuition (
#     institution VARCHAR(100) NOT NULL,
#     year SMALLINT NOT NULL,
#     sector SMALLINT NOT NULL,
#     sector_name VARCHAR(100) NOT NULL,
#     state VARCHAR(5) NOT NULL,
#     high_cost BOOLEAN NOT NULL,
#     low_cost BOOLEAN NOT NULL,
#     cost INTEGER NOT NULL
# );
# '''

# cursor.execute(create_table_query)

# conn.commit()

In [131]:
from sqlalchemy import create_engine

DATABASE_URL = "postgresql://postgres:password@localhost:5432/project"

engine = create_engine(DATABASE_URL)

table_name = 'tuition'
clean_data.to_sql(table_name, engine, if_exists='replace', index=False)

engine.dispose()

In [133]:
clean_data[:3]

In [134]:
query = "SELECT * FROM tuition LIMIT 100;"

cursor.execute(query)

rows = cursor.fetchall()

for row in rows:
    print(row)

In [140]:
query = "SELECT * FROM PredTuition LIMIT 100;"

cursor.execute(query)

rows = cursor.fetchall()

for row in rows:
    print(row)

In [137]:
cursor.close()
conn.close()