In [18]:
# !pip install pyodbc
# # import pyodbc
# # import csv

In [19]:
import csv
import math
import os

import pyodbc

In [20]:


# Database connection details.
# Every value can be overridden through an environment variable, so the
# notebook can be pointed at another server or account without editing this cell.
# SECURITY: the literal fallbacks keep the notebook runnable as before, but a
# password stored in a notebook is readable by anyone who can open the file.
# Prefer exporting LDS_DB_PASSWORD, then remove the literal and rotate the credential.
SERVER = os.environ.get('LDS_DB_SERVER', 'lds.di.unipi.it')
DATABASE = os.environ.get('LDS_DB_DATABASE', 'Group_ID_12_DB')
USERNAME = os.environ.get('LDS_DB_USERNAME', 'Group_ID_12')
PASSWORD = os.environ.get('LDS_DB_PASSWORD', '04AIXZEG')
# Connection string for SQL Server; the doubled braces in the f-string emit
# the literal "{SQL Server}" driver name.
CONN_STRING = f'DRIVER={{SQL Server}};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD}'




In [21]:
# # new block : get right string for table location

# prefix_string = 'MVSalvatore'
# string_SSMS = prefix_string + "." + DATABASE + " - " + USERNAME + '.'

In [22]:
def connect_to_db():
    """
    Open a pyodbc connection using the module-level CONN_STRING.

    :return: The open connection, or None when pyodbc reports an error
             (the error is printed rather than raised).
    """
    try:
        db_connection = pyodbc.connect(CONN_STRING)
    except pyodbc.Error as connect_error:
        print(f"Error connecting to database: {connect_error}")
        return None

    print("Connected to the database.")
    return db_connection



In [None]:
def _rollback_quietly(connection):
    """Roll back the open transaction, reporting (not raising) a rollback failure."""
    try:
        connection.rollback()
    except Exception as rollback_error:
        # The original error is what matters to the caller; a failed rollback is only reported.
        print(f"Rollback failed: {rollback_error}")


def load_csv_to_table(connection, file_path, table_name, columns, batch_size=1000):
    """
    Load data from a CSV file into a SQL Server table inside one transaction.

    The header row is skipped and every remaining row is inserted in batches.
    Either all batches are committed, or on any error the whole file is rolled back.

    :param connection: Open database connection; must provide cursor(), commit(),
                       rollback() and an ``autocommit`` attribute
    :param file_path: Path to the CSV file
    :param table_name: Target table name; interpolated into the SQL text, so it must
                       come from trusted code, not user input
    :param columns: List of column names in the target table, in the same order as
                    the CSV columns (also interpolated into the SQL text)
    :param batch_size: Number of rows sent per executemany() call; must be >= 1
    :return: True when the file was loaded and committed, False when an error
             occurred (the error is printed and the transaction rolled back)
    :raises BaseException: Non-Exception signals such as KeyboardInterrupt are
                           re-raised after rolling back, never reported as success
    """
    cursor = connection.cursor()
    success = True
    try:
        if batch_size < 1:
            # A negative size would yield zero batches and a silent "successful" empty load.
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        # newline='' lets the csv module handle line endings inside quoted fields itself.
        with open(file_path, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            if next(csv_reader, None) is None:  # Skip header row
                raise ValueError("CSV file is empty (no header row)")
            rows = list(csv_reader)

        total_batches = math.ceil(len(rows) / batch_size)
        print(f"Processing {len(rows)} rows from {file_path} in {total_batches} batches.")

        # The statement is the same for every batch, so build it once. The number of
        # parameter markers follows the column list that appears in the same statement.
        placeholders = ', '.join(['?'] * len(columns))
        sql_query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"

        connection.autocommit = False  # Start a transaction
        for batch_num, start_idx in enumerate(range(0, len(rows), batch_size), start=1):
            cursor.executemany(sql_query, rows[start_idx:start_idx + batch_size])
            print(f"Batch {batch_num}/{total_batches} loaded for {file_path}.")

        connection.commit()
        print(f"Data loaded into table {table_name} successfully.")
    except Exception as e:
        _rollback_quietly(connection)  # Roll back all changes for this file
        print(f"Error loading file {file_path} into table {table_name}: {e}")
        success = False
    except BaseException:
        # KeyboardInterrupt / SystemExit: undo the partial load, then let it propagate.
        _rollback_quietly(connection)
        raise
    finally:
        cursor.close()
    # Returning here, not inside ``finally``, so in-flight exceptions are not discarded.
    return success



In [None]:
# Load every CSV file listed in main() into its target table, one transaction per file.

def main():
    """
    Load every mapped CSV file into its table and print a per-file summary.

    Opens one database connection, calls load_csv_to_table once per file, and
    always closes the connection afterwards, even if a load raises.

    :return: None (returns early, without loading, when the connection fails)
    """
    # Establish connection to the database
    connection = connect_to_db()
    if not connection:
        return

    # Define mappings of CSV files to their respective tables and columns.
    # Files are loaded in the order listed here.
    # NOTE(review): the fact table is loaded first; if the target tables enforce
    # foreign keys to the dimension tables this order would fail — confirm against the schema.
    file_table_mapping = {
        'FactTable.csv': ('S_ClaimFactTable', ['crash_id', 'cause_id', 'person_id', 'vehicle_id', 'location_id', 'date_id', 'damage_category', 'damage']),
        'VehicleDimension.csv': ('S_VehicleDimension', ['vehicle_id', 'crash_id', 'vehicle_type', 'manufacturer', 'model', 'registration_state']),
        'CauseDimension.csv': ('S_CauseDimension', ['cause_id', 'primary_cause', 'secondary_cause', 'road_condition', 'lighting_condition', 'weather_condition', 'speed_limit', 'traffic_control_device', 'device_condition', 'alignment', 'road_defect']),
        'PersonDimension.csv': ('S_PersonDimension', ['person_id', 'age', 'gender', 'role_in_crash', 'injury_severity', 'is_under_21']),
        'GeographyDimension.csv': ('S_GeographyDimension', ['location_id', 'beat_id', 'street_name', 'street_number', 'area_risk_level']),
        'DateDimension.csv': ('S_DateDimension', ['date_id', 'month', 'year', 'quarter', 'day_of_week', 'week_number', 'is_weekend', 'is_holiday']),
        'CrashDimension.csv': ('S_CrashDimension', ['crash_id', 'crash_date', 'crash_time', 'num_units', 'crash_severity_category'])
    }
    # Track success and failure
    successful_files = []
    failed_files = []

    try:
        # Load each CSV file into its respective table
        for file_path, (table_name, columns) in file_table_mapping.items():
            print(f"Loading data from {file_path} into {table_name}...")
            success = load_csv_to_table(connection, file_path, table_name, columns, batch_size=1000)
            if success:
                successful_files.append(file_path)
            else:
                failed_files.append(file_path)
    finally:
        # Close the database connection even when a load raises, so it is never leaked.
        connection.close()
    print("Database population complete.")

    # Printing summary
    print("\nSummary of operations:")
    print("Successful file loads:")
    for file in successful_files:
        print(f" - {file}")
    print("Failed file loads:")
    for file in failed_files:
        print(f" - {file}")


# Run the load only when executed directly (or as a notebook cell), not on import.
if __name__ == "__main__":
    main()

Connected to the database.
Loading data from GeographyDimension.csv into GeographyDimension...
Processing 274 rows from GeographyDimension.csv in 1 batches.
Batch 1/1 loaded for GeographyDimension.csv.
Data loaded into table GeographyDimension successfully.
Database population complete.

Summary of operations:
Successful file loads:
 - GeographyDimension.csv
Failed file loads:
