In [1]:
import pandas as pd
import numpy as np
import os
import random
import string
import datetime

In [2]:
# Maximum number of exam days allowed for each data set instance, keyed by the
# instance name (the same string that is passed as `file_name` to create_date_dataset).
days_dict = {
 'car-s-91': 18,
 'car-f-92': 16,
 'ear-f-83': 12,
 'hec-s-92': 9,
 'kfu-s-93': 10,
 'lse-f-91': 9,
 'pur-s-93': 21,
 'rye-s-93': 12,
 'sta-f-83': 7,
 'tre-s-92': 12,
 'uta-s-92': 18,
 'ute-s-92': 5,
 'yor-f-83': 11
}

In [3]:
def file_name_to_path(file_name):
    """Return the path of an instance inside ``../data/RAW`` (no file extension added)."""
    return "../data/RAW/{}".format(file_name)

In [4]:

def generate_random_string(total_length, num_digits=3, custom1='', custom2=''):
    """
    Generate `total_length` unique random identifiers, returned as a sorted list.

    Each identifier is the plain concatenation (no separators) of
    ``custom1 + <2-letter subject code> + custom2 + <num_digits random digits>``,
    e.g. ``'BI024'`` with the defaults or ``'23BI00136'`` with
    ``num_digits=5, custom1='23'``.

    Parameters:
    total_length (int): Number of identifiers to generate.
    num_digits (int): Number of random digits appended to each identifier.
    custom1 (str): Text placed before the subject code.
    custom2 (str): Text placed between the subject code and the digits.

    Returns:
    list[str]: Sorted list of unique identifiers.

    Raises:
    ValueError: If more identifiers are requested than can exist for the given
                `num_digits` (the sampling loop below could otherwise never finish).
    """
    first_letter_combinations = ['MA', 'PY', 'CH', 'BI', 'EN', 'ST', 'HI', 'GO', 'CA']

    # Only len(codes) * 10**num_digits distinct identifiers exist; asking for more
    # would make the rejection-sampling loop spin forever.
    max_unique = len(first_letter_combinations) * 10 ** max(num_digits, 0)
    if total_length > max_unique:
        raise ValueError(
            f"Cannot generate {total_length} unique strings: only {max_unique} "
            f"combinations exist for num_digits={num_digits}"
        )

    output = set()

    # Cycle through the subject codes so identifiers are spread across all of them;
    # duplicates are absorbed by the set and simply retried on a later pass.
    while len(output) < total_length:
        for letters in first_letter_combinations:
            if len(output) >= total_length:
                break
            number = ''.join(random.choices(string.digits, k=num_digits))
            output.add(custom1 + letters + custom2 + number)

    return sorted(output)

In [5]:

def process_course_data(file_name):
    """
    Convert the raw ``.crs`` file of an instance into ``../data/<file_name>/course_data.csv``.

    The output has the columns: courseId, numStudents, computers, projector,
    whiteboard, internet, audio, printer, backup-power. Course ids are freshly
    generated random codes; every facility flag is drawn independently and is 1
    with probability 0.2.

    Side effects: sets the globals `course_mapping` (generated ids, in row order)
    and `facility_list`, creates the output folder if needed and writes the CSV.

    Returns:
    pd.DataFrame: The frame that was written to disk.
    """
    global course_mapping
    global facility_list

    # Raw rows are space separated: <old course id> <number of students>
    raw_courses = pd.read_csv(
        file_name_to_path(file_name) + ".crs",
        sep=' ',
        header=None,
        names=['old_course_id', 'num_students'],
    )
    course_count = len(raw_courses)

    # New ids are assigned positionally: row i receives course_mapping[i]
    course_mapping = generate_random_string(course_count)
    raw_courses['courseId'] = course_mapping
    raw_courses['numStudents'] = raw_courses['num_students']

    # One 0/1 column per facility, drawn one course at a time
    facility_list = ['computers', 'projector', 'whiteboard', 'internet', 'audio', 'printer', 'backup-power']
    for facility in facility_list:
        flags = []
        for _ in range(course_count):
            flags.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        raw_courses[facility] = flags

    # Keep only the renamed columns and the facility flags
    course_df = raw_courses.drop(columns=['old_course_id', 'num_students'])

    # Make sure the per-instance output folder exists, then write the CSV
    output_folder = "../data/{}".format(file_name)
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, 'course_data.csv')
    course_df.to_csv(output_file, index=False)

    print(f"Processed course data saved to {output_file}")
    return course_df

In [6]:
# Build course_data.csv for the 'car-f-92' instance.
process_course_data('car-f-92')
# df = pd.read_csv("car-f-92\course_data.csv")
# df.sort_values('numStudents', ascending=False)

Processed course data saved to ../data/car-f-92\course_data.csv


Unnamed: 0,courseId,numStudents,computers,projector,whiteboard,internet,audio,printer,backup-power
0,BI024,280,0,0,0,0,0,1,0
1,BI028,100,1,0,0,0,0,0,0
2,BI036,64,0,0,0,0,0,1,0
3,BI143,73,0,0,1,0,0,0,0
4,BI169,73,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
538,ST913,23,0,0,0,0,0,0,1
539,ST921,26,0,0,1,0,0,0,1
540,ST942,8,0,0,0,0,1,0,0
541,ST962,8,0,1,0,0,0,0,0


In [7]:
def process_student_data(file_name):
    """
    Convert the raw ``.stu`` file of an instance into ``../data/<file_name>/student_data.csv``.

    Every line of the raw file lists the 1-based course numbers one student is
    enrolled in. Each number is translated to the generated course id stored at
    that row of the instance's ``course_data.csv``, and every student receives a
    random roll number of the form ``23<2-letter code><5 digits>``.

    Parameters:
    file_name (str): Instance name; ``course_data.csv`` must already exist in
                     ``../data/<file_name>`` (written by process_course_data).

    Returns:
    pd.DataFrame: Columns 'rollNum' and 'coursesEnrolled' (space separated course ids).
    """
    output_folder = f"../data/{file_name}"

    # Take the id mapping from the saved course file of *this* instance rather than
    # from a notebook-global variable, which is undefined on a fresh kernel and
    # stale if a different instance was processed last.
    course_ids = pd.read_csv(os.path.join(output_folder, 'course_data.csv'))['courseId'].tolist()

    file_path = f"{file_name_to_path(file_name)}.stu"
    with open(file_path, 'r') as file:
        course_list = [line.split() for line in file]

    # Raw course numbers are 1-based positions into the course list
    new_students = [
        " ".join(course_ids[int(course) - 1] for course in student)
        for student in course_list
    ]

    rollnum_list = generate_random_string(len(new_students), 5, '23')

    df = pd.DataFrame(list(zip(rollnum_list, new_students)), columns=['rollNum', 'coursesEnrolled'])

    # Save the processed data to CSV
    output_file = os.path.join(output_folder, 'student_data.csv')
    df.to_csv(output_file, index=False)

    print(f"Processed student data saved to {output_file}")
    return df
    

In [8]:
# Build student_data.csv for the same instance; run after process_course_data('car-f-92').
process_student_data('car-f-92')

Processed course data saved to ../data/car-f-92\student_data.csv


Unnamed: 0,rollNum,coursesEnrolled
0,23BI00136,CH844
1,23BI00145,CH626
2,23BI00168,GO729
3,23BI00199,BI179
4,23BI00238,CH611 CH626
...,...,...
18414,23ST99632,PY905 ST195 ST234
18415,23ST99830,PY038
18416,23ST99870,EN556 EN720 EN854
18417,23ST99880,EN875 EN903


# Creating Rooms using frequency distribution

In [9]:
def create_room_data(file_name):
    """
    Creates room data based on the provided course data CSV file and saves it to
    ``../data/<file_name>/room_data.csv``.

    The class sizes ('numStudents') are binned with a 10-bin histogram. Each bin
    becomes a capacity tier whose capacity is the bin's upper edge rounded up to a
    multiple of 10, and which receives ceil(bin_count * 10 / number_of_courses) rooms.

    Parameters:
    file_name (str): Name of the folder containing the course_data.csv file.

    Returns:
    pd.DataFrame: DataFrame containing room data with columns 'RoomId', 'Capacity',
                  'computers', 'projector', 'whiteboard', 'internet', 'audio', 'printer',
                  'backup-power'.
    """
    # Read course data CSV
    course_data = pd.read_csv(f'../data/{file_name}/course_data.csv')
    data = course_data['numStudents']

    facilities = ['computers', 'projector', 'whiteboard', 'internet', 'audio', 'printer', 'backup-power']
    columns = ['RoomId', 'Capacity'] + facilities

    num_courses = len(data)
    if num_courses:
        # Histogram of class sizes -> number of rooms and capacity per tier
        counts, bin_edges = np.histogram(data)
        room_counts = np.ceil(counts * 10 / num_courses).astype(int)
        capacities = (np.ceil(bin_edges / 10) * 10)[1:].astype(int)  # upper edges, rounded up to tens
    else:
        # No courses: avoid dividing by zero and produce an empty room table
        room_counts, capacities = [], []

    # Collect plain records and build the frame once; concatenating inside the loop
    # is quadratic and leaves every column with object dtype.
    records = []
    for capacity, count in zip(capacities, room_counts):
        for j in range(count):
            row = {'RoomId': f'Room{len(records) + 1}', 'Capacity': int(capacity)}
            for facility in facilities:
                # The first room of every capacity tier has all facilities, so a
                # fully equipped room exists for each tier; the rest are random.
                row[facility] = 1 if j == 0 else np.random.randint(0, 2)
            records.append(row)

    room_df = pd.DataFrame(records, columns=columns)

    # Save the processed data to CSV
    output_folder = f"../data/{file_name}"
    output_file = os.path.join(output_folder, 'room_data.csv')
    room_df.to_csv(output_file, index=False)

    print(f"Processed room data saved to {output_file}")

    return room_df



In [10]:
# Example usage: generate room_data.csv from the course_data.csv of 'car-f-92'.
file_name = 'car-f-92'
create_room_data(file_name)


Processed room data saved to ../data/car-f-92\room_data.csv


Unnamed: 0,RoomId,Capacity,computers,projector,whiteboard,internet,audio,printer,backup-power
0,Room1,160,1,1,1,1,1,1,1
1,Room2,160,0,1,1,0,1,0,1
2,Room3,160,0,1,1,1,1,0,1
3,Room4,160,1,0,1,0,1,1,0
4,Room5,160,0,1,1,1,1,0,0
5,Room6,160,0,1,1,1,0,0,1
6,Room7,160,1,0,0,0,0,1,1
7,Room8,160,1,1,0,0,1,1,0
8,Room9,160,0,0,1,1,1,1,1
9,Room10,320,1,1,1,1,1,1,1


In [11]:
def create_date_dataset(file_name, start_date='2025-03-03'):
    """
    Create the list of exam dates for an instance and save it to
    ``../data/<file_name>/dates.csv``.

    The number of exam days is taken from ``days_dict[file_name]``. Dates are
    consecutive weekdays (Saturdays and Sundays are skipped) beginning at
    `start_date`, or at the next Monday if `start_date` falls on a weekend.

    Parameters:
    file_name (str): Instance name; must be a key of `days_dict`.
    start_date (str or datetime-like): First candidate exam date.

    Returns:
    pd.DataFrame: Columns 'Date' and 'T-Day', where 'T-Day' is the number of
                  calendar days between `start_date` and the exam date.

    Raises:
    KeyError: If `file_name` is not a key of `days_dict`.
    """
    start_date = pd.to_datetime(start_date)

    # Number of days for the given instance
    num_days = days_dict[file_name]

    # Business-day range: Monday to Friday only
    dates = pd.bdate_range(start=start_date, periods=num_days)
    df = pd.DataFrame({'Date': dates, 'T-Day': (dates - start_date).days})

    # Create a folder to store the output CSV
    output_folder = f"../data/{file_name}"
    os.makedirs(output_folder, exist_ok=True)

    # Save the processed data to CSV
    output_file = os.path.join(output_folder, 'dates.csv')
    df.to_csv(output_file, index=False)

    print(f"Processed dates data saved to {output_file}")

    return df


In [12]:
# Build dates.csv for 'car-f-92'; the number of exam days comes from days_dict.
create_date_dataset('car-f-92')

Processed room data saved to ../data/car-f-92\dates.csv


Unnamed: 0,Date,T-Day
0,2025-03-03,0
1,2025-03-04,1
2,2025-03-05,2
3,2025-03-06,3
4,2025-03-07,4
5,2025-03-10,7
6,2025-03-11,8
7,2025-03-12,9
8,2025-03-13,10
9,2025-03-14,11


In [13]:
def create_times_dataset(file_name):
    """
    Write the two daily exam start times (10:00 and 14:00, as 'hh:mm:ss' strings)
    to ``../data/<file_name>/times.csv`` and return them as a DataFrame.

    The output folder is created when missing. The CSV is written with the
    DataFrame index as its first column.
    """
    slot_starts = ['10:00:00', '14:00:00']  # morning and afternoon slot
    times_df = pd.Series(slot_starts, name='Time').to_frame()

    target_dir = "../data/{}".format(file_name)
    os.makedirs(target_dir, exist_ok=True)

    output_file = os.path.join(target_dir, 'times.csv')
    times_df.to_csv(output_file)

    print(f"Processed times data saved to {output_file}")
    return times_df
    

In [14]:
# Build times.csv (the two daily exam slots) for 'car-f-92'.
create_times_dataset("car-f-92")

Processed times data saved to ../data/car-f-92\times.csv


Unnamed: 0,Time
0,10:00:00
1,14:00:00


In [15]:
def create_room_preference(file_name):
    """
    Build a course x room preference matrix and save it to
    ``../data/<file_name>/room_preference.csv``.

    A room is feasible for a course when it offers every facility the course
    requires and its capacity is at least the number of enrolled students.
    Feasible pairs receive a random preference from 1 to 5 (drawn with
    probabilities 0.1, 0.2, 0.4, 0.2, 0.1); infeasible pairs are 0.

    Parameters:
    file_name (str): Name of the folder containing course_data.csv and room_data.csv.

    Returns:
    pd.DataFrame: Preferences indexed by 'courseId' with one column per 'RoomId'.

    Raises:
    KeyError: If room_data.csv lacks one of the facility columns of course_data.csv.
    """
    # Load course and room data
    course_df = pd.read_csv(f"../data/{file_name}/course_data.csv")
    room_df = pd.read_csv(f"../data/{file_name}/room_data.csv")

    # Select the facility columns by name on both sides so the two matrices line up
    # even if the CSV files store the facilities in a different column order.
    facility_cols = [col for col in course_df.columns if col not in ('courseId', 'numStudents')]

    # Facility check: the dot product counts how many required facilities a room
    # offers; the room qualifies when that equals the number of required facilities.
    c_f = course_df[facility_cols].to_numpy(int)
    f_r = room_df[facility_cols].to_numpy(int).T
    c_r = np.dot(c_f, f_r)
    facility_check = c_r >= c_f.sum(axis=1)[:, np.newaxis]

    # Capacity check (broadcast: courses down the rows, rooms across the columns)
    num_students = course_df['numStudents'].to_numpy(int)
    capacities = room_df['Capacity'].to_numpy(int)
    capacity_check = capacities >= num_students[:, np.newaxis]

    # Feasible rooms mask
    feasible_rooms = facility_check & capacity_check

    # Define probabilities for preferences [1, 2, 3, 4, 5]
    preferences = np.array([1, 2, 3, 4, 5])
    probabilities = np.array([0.1, 0.2, 0.4, 0.2, 0.1])

    # Draw a preference for every pair, then keep it only where the room is feasible
    random_preferences = np.random.choice(preferences, size=feasible_rooms.shape, p=probabilities)
    preference_matrix = np.zeros(feasible_rooms.shape, dtype=int)
    preference_matrix[feasible_rooms] = random_preferences[feasible_rooms]

    # Label rows with course ids and columns with room ids
    preference_df = pd.DataFrame(preference_matrix,
                                 index=course_df['courseId'],
                                 columns=room_df['RoomId'])

    # Save the processed data to CSV
    output_folder = f"../data/{file_name}"
    output_file = f"{output_folder}/room_preference.csv"
    preference_df.to_csv(output_file)

    print(f"Processed room preference data saved to {output_file}")
    return preference_df

# Example usage: needs course_data.csv and room_data.csv of the instance on disk
file_name = 'car-f-92'
room_preference_df = create_room_preference(file_name)

# Display the resulting DataFrame
room_preference_df


Processed room preference data saved to ../data/car-f-92/room_preference.csv


RoomId,Room1,Room2,Room3,Room4,Room5,Room6,Room7,Room8,Room9,Room10,Room11,Room12,Room13,Room14
courseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
BI024,0,0,0,0,0,0,0,0,0,3,2,2,4,2
BI028,4,0,0,2,0,0,3,3,0,3,3,3,5,4
BI036,4,0,0,3,0,0,3,3,3,4,3,2,2,3
BI143,1,4,4,4,3,3,0,0,5,3,2,3,3,4
BI169,5,0,0,3,0,0,3,3,0,3,5,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ST913,2,5,3,0,0,2,4,0,3,2,5,3,3,3
ST921,3,4,2,0,0,3,0,0,2,3,2,1,3,3
ST942,5,4,2,1,4,0,0,5,2,3,3,4,2,1
ST962,4,2,3,0,5,5,0,3,0,3,1,1,3,4


In [18]:
def create_datetime_preference(file_name):
    """
    Build a course x exam-slot preference matrix and save it to
    ``../data/<file_name>/datetime_preference.csv``.

    Exam slots are all "<date> <time>" combinations of dates.csv and times.csv
    (dates in the outer loop, times in the inner loop). Every course/slot pair
    receives a random preference from 1 to 5, drawn with probabilities
    0.1, 0.2, 0.4, 0.2, 0.1.

    Parameters:
    file_name (str): Name of the folder containing course_data.csv, dates.csv
                     and times.csv.

    Returns:
    pd.DataFrame: Preferences indexed by course id with one column per exam slot.
    """
    data_folder = f"../data/{file_name}"

    # Take the course ids from this instance's course file instead of a
    # notebook-global list, which is undefined on a fresh kernel and belongs to
    # whichever instance was processed last.
    course_ids = pd.read_csv(f"{data_folder}/course_data.csv")['courseId'].tolist()

    # Load dates and times datasets
    dates_df = pd.read_csv(f"{data_folder}/dates.csv")
    times_df = pd.read_csv(f"{data_folder}/times.csv")

    # Create datetime combinations
    date_times = [f"{date} {time}" for date in dates_df['Date'] for time in times_df['Time']]

    # Define probabilities for preferences [1, 2, 3, 4, 5]
    preferences = np.array([1, 2, 3, 4, 5])
    probabilities = np.array([0.1, 0.2, 0.4, 0.2, 0.1])

    # Randomly assign preferences to each course-time combination
    random_preferences = np.random.choice(
        preferences, size=(len(course_ids), len(date_times)), p=probabilities
    )

    # Pass the ids as a flat list: wrapping them in another list would build a
    # one-level MultiIndex instead of a plain index.
    preference_df = pd.DataFrame(random_preferences,
                                 index=course_ids,
                                 columns=date_times)

    # Save the processed data to CSV
    output_file = f"{data_folder}/datetime_preference.csv"
    preference_df.to_csv(output_file, index=True)

    print(f"Processed datetime preference data saved to {output_file}")
    return preference_df

# Example usage: needs dates.csv and times.csv of the instance on disk
file_name = 'car-f-92'
datetime_preference_df = create_datetime_preference(file_name)

# Display the resulting DataFrame
datetime_preference_df


Processed datetime preference data saved to ../data/car-f-92/datetime_preference.csv


Unnamed: 0,2025-03-03 10:00:00,2025-03-03 14:00:00,2025-03-04 10:00:00,2025-03-04 14:00:00,2025-03-05 10:00:00,2025-03-05 14:00:00,2025-03-06 10:00:00,2025-03-06 14:00:00,2025-03-07 10:00:00,2025-03-07 14:00:00,...,2025-03-18 10:00:00,2025-03-18 14:00:00,2025-03-19 10:00:00,2025-03-19 14:00:00,2025-03-20 10:00:00,2025-03-20 14:00:00,2025-03-21 10:00:00,2025-03-21 14:00:00,2025-03-24 10:00:00,2025-03-24 14:00:00
BI024,5,3,5,2,1,3,3,4,3,4,...,2,3,2,3,3,1,3,2,3,5
BI028,5,5,2,3,3,3,3,4,3,4,...,3,3,5,4,3,2,5,2,3,5
BI036,4,3,4,3,5,4,4,4,5,3,...,5,4,4,4,2,3,2,3,5,3
BI143,2,3,4,1,5,3,5,2,5,3,...,3,2,1,4,2,2,3,1,3,4
BI169,4,5,2,3,3,2,4,3,2,3,...,3,4,1,1,4,4,4,3,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ST913,1,4,3,4,1,3,1,3,5,4,...,2,3,3,1,5,5,3,2,1,1
ST921,3,3,3,3,3,3,3,2,3,3,...,3,4,2,2,4,2,3,1,2,3
ST942,3,3,2,3,3,5,4,3,4,2,...,5,4,3,5,3,2,3,4,4,4
ST962,2,2,1,3,4,1,2,3,2,4,...,1,4,2,3,3,2,2,3,3,3
