---
#### Creating a script that will increase the side of my dataset. The original size of the dataset was 520KB and was increased to 484.2MB.
---

In [2]:
# Importing Libraries
import pandas as pd
import time
import numpy as np
from datetime import datetime

In [3]:
# Reading in the dataset
df = pd.read_csv('dataset.csv')

# Display the first few rows
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nationality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [5]:
start_time = time.time() # By timing this function - we can evaulate it's performance

def increasing_student(df, target_size):

    original_size = len(df)
    additional_copies_needed = target_size // original_size - 1

    # Creating a list to hold the original dataset and any copies required
    dfs = [df]
    
    for _ in range(additional_copies_needed):
        dfs.append(df.copy())
    
    # Concatenate all DataFrames in the list to reach the target size
    grown_df = pd.concat(dfs, ignore_index=True)

    #Adding extra rows needed to match target size
    extra_rows_needed = target_size - len(grown_df)
    if extra_rows_needed > 0:
        grown_df = pd.concat([grown_df, df.iloc[:extra_rows_needed]], ignore_index=True)
    
    return grown_df

# Specifing the amount of rows needed
target_size = 250_000

# Growing the dataset in size
grown_df = increasing_student(df, target_size)

# Stopping the timer
end_time = time.time()

df = grown_df.copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 35 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Marital status                                  250000 non-null  int64  
 1   Application mode                                250000 non-null  int64  
 2   Application order                               250000 non-null  int64  
 3   Course                                          250000 non-null  int64  
 4   Daytime/evening attendance                      250000 non-null  int64  
 5   Previous qualification                          250000 non-null  int64  
 6   Nationality                                     250000 non-null  int64  
 7   Mother's qualification                          250000 non-null  int64  
 8   Father's qualification                          250000 non-null  int64  
 9   Mother's occupation       

In [6]:
# Calculating running time
execution_time = end_time - start_time 

# Converting time to mins and seconds
minutes = int(execution_time // 60)
seconds = int(execution_time % 60)

print(f"Running time to create increased csv: {minutes} minutes {seconds} seconds")

Running time to create increased csv: 0 minutes 0 seconds


In [7]:
# Starting the time to evaulate performance
start_time = time.time()

# Printing increased csv file
grown_df.to_csv('student_increased.csv', index= False)

# Stopping the clock
end_time = time.time()

In [8]:
# Calculating the running time
execution_time = end_time - start_time

# Convert execution time to mins and seconds
minutes = int(execution_time // 60)
seconds = int(execution_time % 60)

print(f"Running time in printing out student_increased.csv: {minutes} minutes and {seconds} seconds")

Running time in printing out student_increased.csv: 0 minutes and 3 seconds


In [9]:
# Obtaining current Date and Time
current_time = datetime.now()

# Formatting the date and time into a readable format
formatted_time = current_time.strftime('%B %d, %Y, %H:%M:%S')

# Printing the formatted date and time
print(f"Increasing the dataset was last run on the: {formatted_time}")

Increasing the dataset was last run on the: March 15, 2025, 22:42:28
