## Read Data

In [31]:
import pandas as pd 
import glob

path = './data/*.xls*'

files = glob.glob(path)

data = pd.DataFrame()
bad_col = 'Name of institutions'
good_col = 'Name of institution'
for file in files: 
    df = pd.read_excel(file, sheet_name='Tuition')
    if bad_col in df.columns:
        df.rename(columns={bad_col: good_col}, inplace=True)
    
    df.drop(['UnitID', 'OPEID'], axis=1, inplace=True)
    
    if data.empty:
        data = df
        continue
    
    df.drop(['Sector', 'Sector name', 'State', 'List A: High tuition and fee indicator', 'List E: Low tuition and fee indicator'], axis=1, inplace=True)
    
    data = pd.merge(data, df, on=good_col)

data

Unnamed: 0,Sector,Sector name,Name of institution,State,2010-11 Tuition and fees,List A: High tuition and fee indicator,List E: Low tuition and fee indicator,2011-12 Tuition and fees,2012-13 Tuition and fees,2013-14 Tuition and fees,2014-15 Tuition and fees
0,1,"4-year, public",Pennsylvania State University-Main Campus,PA,15250.0,1,0,15984.0,16444.0,16992,17502
1,1,"4-year, public",University of Pittsburgh-Pittsburgh Campus,PA,14936.0,1,0,16132.0,16590.0,17100,17772
2,1,"4-year, public",University of Vermont,VT,14066.0,1,0,14784.0,15284.0,15718,16226
3,1,"4-year, public",University of New Hampshire-Main Campus,NH,13672.0,1,0,15250.0,16422.0,16496,16552
4,1,"4-year, public",St Mary's College of Maryland,MD,13630.0,1,0,14445.0,14773.0,14864,13824
...,...,...,...,...,...,...,...,...,...,...,...
5035,9,"Less than 2-year, private for-profit",Industrial Technical College,PR,6330.0,0,1,6330.0,6330.0,6480,6480
5036,9,"Less than 2-year, private for-profit",American Educational College,PR,6274.0,0,1,6274.0,6294.0,6324,6422
5037,9,"Less than 2-year, private for-profit",Future-Tech Institute,FL,6200.0,0,1,6200.0,6588.0,6588,6279
5038,9,"Less than 2-year, private for-profit",South Texas Training Center,TX,5080.0,0,1,5080.0,8400.0,8400,10000


## Clean Data

In [32]:
null_counts = data.isnull().sum()

null_counts

Sector                                    0
Sector name                               0
Name of institution                       0
State                                     0
2010-11 Tuition and fees                  1
List A: High tuition and fee indicator    0
List E: Low tuition and fee indicator     0
2011-12 Tuition and fees                  1
2012-13 Tuition and fees                  1
2013-14 Tuition and fees                  0
2014-15 Tuition and fees                  0
dtype: int64

In [33]:
null_indexes = data[data.isnull().any(axis=1)].index

print(f'Rows with null values: {len(null_indexes)}\n')
for row in null_indexes:
    print(data.iloc[row])

Rows with null values: 1

Sector                                                                 2
Sector name                               4-year, private not-for-profit
Name of institution                                       Webb Institute
State                                                                 NY
2010-11 Tuition and fees                                             NaN
List A: High tuition and fee indicator                                -2
List E: Low tuition and fee indicator                                 -2
2011-12 Tuition and fees                                             NaN
2012-13 Tuition and fees                                             NaN
2013-14 Tuition and fees                                           42750
2014-15 Tuition and fees                                           44000
Name: 2776, dtype: object


In [48]:
data_clean = data.dropna()

null_indexes = data_clean[data_clean.isnull().any(axis=1)].index

print(f'Rows with null values: {len(null_indexes)}\n')

Rows with null values: 0



In [49]:
from sklearn.preprocessing import LabelEncoder

string_cols = ['Sector name', 'State']

encoder = LabelEncoder()

for col in string_cols:
    data_clean.loc[:, col + '_encoded'] = encoder.fit_transform(data_clean.loc[:, col])

# data_encoded = pd.get_dummies(data_clean, columns=string_cols)
    
# data_encoded
data_encoded = data_clean.drop(columns=['Sector name', 'State', 'Name of institution'])
data_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.loc[:, col + '_encoded'] = encoder.fit_transform(data_clean.loc[:, col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.loc[:, col + '_encoded'] = encoder.fit_transform(data_clean.loc[:, col])


Unnamed: 0,Sector,2010-11 Tuition and fees,List A: High tuition and fee indicator,List E: Low tuition and fee indicator,2011-12 Tuition and fees,2012-13 Tuition and fees,2013-14 Tuition and fees,2014-15 Tuition and fees,Sector name_encoded,State_encoded
0,1,15250.0,1,0,15984.0,16444.0,16992,17502,5,43
1,1,14936.0,1,0,16132.0,16590.0,17100,17772,5,43
2,1,14066.0,1,0,14784.0,15284.0,15718,16226,5,54
3,1,13672.0,1,0,15250.0,16422.0,16496,16552,5,35
4,1,13630.0,1,0,14445.0,14773.0,14864,13824,5,23
...,...,...,...,...,...,...,...,...,...,...
5035,9,6330.0,0,1,6330.0,6330.0,6480,6480,6,44
5036,9,6274.0,0,1,6274.0,6294.0,6324,6422,6,44
5037,9,6200.0,0,1,6200.0,6588.0,6588,6279,6,10
5038,9,5080.0,0,1,5080.0,8400.0,8400,10000,6,50


## Prediction Analysis

In [51]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = data_encoded.drop(columns=['2014-15 Tuition and fees'])
y = data_encoded['2014-15 Tuition and fees']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

predicted = model.predict(X_test)


In [53]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) score:", r2)

Mean Squared Error (MSE): 23569340.401116803
Root Mean Squared Error (RMSE): 4854.826505768955
Mean Absolute Error (MAE): 2435.277964949702
R-squared (R2) score: 0.8475140284626127


## Add Data to DB

### Postgres

In [3]:
import psycopg2

# Define your connection parameters
db_host = 'localhost'
db_port = '5432'
db_name = 'tuition'
db_user = 'my_user'
db_password = 'password'

# Establish a connection to the PostgreSQL database
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )
    print("Connected to the database")
    
    cursor = conn.cursor()
    
    cursor.execute("SELECT version();")
    
    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)
    
except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e) 
    
cursor.close()
conn.close()

Error connecting to PostgreSQL: connection to server at "localhost" (::1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?



NameError: name 'cursor' is not defined

### Mysql

In [None]:
# https://dev.mysql.com/doc/connector-python/en/connector-python-example-connecting.html

import mysql.connector
from mysql.connector import errorcode

try:
  cnx = mysql.connector.connect(user='root',
                                password='password',
                                database='tuition')
except mysql.connector.Error as err:
  if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
    print("Something is wrong with your user name or password")
  elif err.errno == errorcode.ER_BAD_DB_ERROR:
    print("Database does not exist")
  else:
    print(err)
    
cursor = cnx.cursor()

print("DATABASES:")
cursor.execute("SHOW DATABASES")
for x in cursor:
    print(x)
    
print("\nTABLES:")    
cursor.execute("SHOW TABLES")
for x in cursor: 
    print(x)

DATABASES:
('information_schema',)
('mysql',)
('performance_schema',)
('sys',)
('tuition',)

TABLES:


In [None]:
# create_table = ("CREATE TABLE `Wiki_Edit` ("
#     " `RevisionID` INT PRIMARY KEY,"
#     " `ArticleName` VARCHAR(500),"
#     " `EditDate` DATE,"
#     " `UserName` VARCHAR(50))")

# try:
#     print(f'Creating Table "Wiki_Edit"')
#     cursor.execute(create_table)
# except mysql.connector.Error as err:
#     if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
#         print("already exists.")
#     else:
#         print(err.msg)
# else:
#     print("OK")