## Read Data

In [14]:
import pandas as pd 
import glob

# Load the 'Tuition' sheet from every Excel workbook under ./data, stack the
# sheets, and collapse them to one row per institution.
path = './data/*.xls*'

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the order of the comma-joined values below reproducible between runs.
# NOTE(review): presumably the file names sort chronologically -- confirm.
files = sorted(glob.glob(path))
if not files:
    # Fail here with a clear message instead of pandas' generic
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f'No Excel workbooks found matching {path!r}')

# Some workbooks name the institution column with a trailing "s"; normalise it
# so the groupby key is the same in every sheet. rename() ignores a missing
# key, so sheets that already use the good name pass through unchanged.
bad_col = 'Name of institutions'
good_col = 'Name of institution'
dfs = [
    pd.read_excel(file, sheet_name='Tuition').rename(columns={bad_col: good_col})
    for file in files
]

# One row per institution: every column becomes a ', '-joined string of its
# non-null values across the stacked sheets. Repeated identical values are
# kept as-is (e.g. "FL, FL, FL"), and numeric columns become strings too.
# The identifier columns are dropped after aggregation.
df = (
    pd.concat(dfs, ignore_index=True)
    .groupby(good_col)
    .agg(lambda x: ', '.join(x.dropna().astype(str)))
    .drop(columns=['UnitID', 'OPEID'])
)

df

Unnamed: 0_level_0,Sector,Sector name,State,2010-11 Tuition and fees,List A: High tuition and fee indicator,List E: Low tuition and fee indicator,2011-12 Tuition and fees,2012-13 Tuition and fees,2013-14 Tuition and fees,2014-15 Tuition and fees
Name of institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AI Miami International University of Art and Design,"3, 3, 3, 3, 3","4-year, private for-profit, 4-year, private fo...","FL, FL, FL, FL, FL",17704.0,"0, 0, 0, 0, 0","0, 0, 0, 0, 0",17704.0,17714.0,17704.0,17704.0
AIB College of Business,"2, 2, 2, 2, 2","4-year, private not-for-profit, 4-year, privat...","IA, IA, IA, IA, IA",13140.0,"0, 0, 0, 0, 0","0, 0, 0, 0, 0",13767.0,14067.0,14913.0,15666.0
AM College LLC,3,"4-year, private for-profit",FL,,0,1,,,,10425.0
ASA College,"6, 6, 6","2-year, private for-profit, 2-year, private fo...","NY, NY, NY",,"0, 0, 0","0, 0, 0",,12094.0,12268.0,12298.0
ASA Institute of Business and Computer Technology,"6, 6","2-year, private for-profit, 2-year, private fo...","NY, NY",12094.0,"0, 0","0, 0",12094.0,,,
...,...,...,...,...,...,...,...,...,...,...
Youngstown State University,"1, 1, 1, 1, 1","4-year, public, 4-year, public, 4-year, public...","OH, OH, OH, OH, OH",7199.0,"0, 0, 0, 0, 0","0, 0, 0, 0, 0",7451.0,7712.0,8129.0,8317.0
Yuba College,"4, 4, 4, 4, 4","2-year, public, 2-year, public, 2-year, public...","CA, CA, CA, CA, CA",752.0,"0, 0, 0, 0, 0","1, 1, 1, 1, 1",976.0,1124.0,1144.0,1144.0
Zane State College,"4, 4, 4, 4, 4","2-year, public, 2-year, public, 2-year, public...","OH, OH, OH, OH, OH",4130.0,"0, 0, 0, 0, 0","0, 0, 0, 0, 0",4290.0,4448.0,4556.0,4646.0
Zion Bible College,"2, 2","4-year, private not-for-profit, 4-year, privat...","MA, MA",9225.0,"0, 0","0, 0",9980.0,,,


## Clean Data

## Prediction Analysis

In [1]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

from sklearn.linear_model import LinearRegression

## Add Data to DB

### Postgres

In [5]:
import psycopg2

import os

# Connection parameters. Each one can be overridden through the standard
# libpq environment variable so real credentials need not be committed with
# the notebook; the literals are the original defaults.
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'tuition')
db_user = os.environ.get('PGUSER', 'my_user')
db_password = os.environ.get('PGPASSWORD', 'password')

# Pre-bind both handles so the cleanup in `finally` is safe even when
# connect() itself fails (previously that path ended in a NameError, or closed
# a stale object left over from an earlier run of the cell).
conn = None
cursor = None

# Connect, run a trivial query as a smoke test, and always release resources.
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )
    print("Connected to the database")

    cursor = conn.cursor()

    cursor.execute("SELECT version();")

    db_version = cursor.fetchone()
    print("PostgreSQL database version:", db_version)

except psycopg2.Error as e:
    print("Error connecting to PostgreSQL:", e)

finally:
    # Close only what was actually opened.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

Connected to the database
PostgreSQL database version: ('PostgreSQL 16.2 (Debian 16.2-1.pgdg120+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',)


### MySQL

In [7]:
# https://dev.mysql.com/doc/connector-python/en/connector-python-example-connecting.html

import mysql.connector
from mysql.connector import errorcode

# Connect to the local MySQL server and list its databases and the tables of
# the 'tuition' schema.
# NOTE(review): the root password is hardcoded here -- consider reading it
# from the environment or getpass before sharing this notebook.
try:
    cnx = mysql.connector.connect(user='root',
                                  password='password',
                                  database='tuition')
except mysql.connector.Error as err:
    # Turn the two most common failures into readable messages.
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
else:
    # Only query when the connection succeeded. Previously a failed connect
    # fell through to cnx.cursor() and raised NameError (or silently reused a
    # stale connection from an earlier run of the cell).
    # `cnx` and `cursor` are deliberately left open for later cells.
    cursor = cnx.cursor()

    print("DATABASES:")
    cursor.execute("SHOW DATABASES")
    for x in cursor:
        print(x)

    print("\nTABLES:")
    cursor.execute("SHOW TABLES")
    for x in cursor:
        print(x)

DATABASES:
('information_schema',)
('mysql',)
('performance_schema',)
('sys',)
('tuition',)

TABLES:


In [None]:
# create_table = ("CREATE TABLE `Wiki_Edit` ("
#     " `RevisionID` INT PRIMARY KEY,"
#     " `ArticleName` VARCHAR(500),"
#     " `EditDate` DATE,"
#     " `UserName` VARCHAR(50))")

# try:
#     print(f'Creating Table "Wiki_Edit"')
#     cursor.execute(create_table)
# except mysql.connector.Error as err:
#     if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
#         print("already exists.")
#     else:
#         print(err.msg)
# else:
#     print("OK")