In [1]:
pip install requests beautifulsoup4 pandas scikit-learn numpy

Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting numpy
  Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (37 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.6.2-py3-none-any.whl.metadata (6.6 kB)
Collecting certifi>=2017.4.17 (from req

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import csv
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# --------------------------------------------------
# CONFIG
# --------------------------------------------------
# Raw GitHub URL whose response body is parsed as CSV text in STEP 2.
URL = "https://raw.githubusercontent.com/arunaravikasturi/Potsdam-University/main/AIBAS/Readme.md"

# Seconds to wait on the server before giving up. Without a timeout,
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Output files (relative paths).
OUTPUT_JOINT = "joint_data_collection.csv"
OUTPUT_TRAIN = "training_data.csv"
OUTPUT_TEST = "test_data.csv"
OUTPUT_ACTIVATION = "activation_data.csv"

# --------------------------------------------------
# STEP 1: SCRAPE RAW TEXT FROM GITHUB
# --------------------------------------------------
# Raises requests.exceptions.Timeout if the server does not answer in time.
response = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail fast on HTTP 4xx/5xx instead of parsing an error page as data.
response.raise_for_status()

# Trim surrounding whitespace so no leading/trailing blank lines reach the parser.
raw_text = response.text.strip()

# --------------------------------------------------
# STEP 2: PARSE CSV SAFELY (handles commas in quotes)
# --------------------------------------------------
# csv.reader is used instead of a naive split(",") so that quoted fields
# containing commas stay in a single cell.
csv_buffer = io.StringIO(raw_text)
reader = csv.reader(csv_buffer)

# csv.reader yields an empty list for a blank line; skip those so blank lines
# never reach the DataFrame constructor.
rows = [row for row in reader if row]

# An empty download would otherwise surface as an opaque IndexError on rows[0].
if not rows:
    raise ValueError(f"No CSV content found at {URL}")

# The first non-blank line is the header; everything after it is data.
header = rows[0]
data = rows[1:]

# All cells are still strings at this point; STEP 3 converts the typed columns.
df = pd.DataFrame(data, columns=header)

# --------------------------------------------------
# STEP 3: DATA TYPE CONVERSION
# --------------------------------------------------
# Numeric codes for the two categorical columns. A label that is missing from
# its table is turned into NaN by Series.map and removed by the dropna below.
company_size_codes = {"Small": 1, "Medium": 2, "Large": 3}
remote_option_codes = {"No": 0, "Yes": 1}

df = df.assign(
    # errors="coerce": unparseable salary strings become NaN instead of raising.
    salary_usd=pd.to_numeric(df["salary_usd"], errors="coerce"),
    company_size=df["company_size"].map(company_size_codes),
    remote_option=df["remote_option"].map(remote_option_codes),
)

# Keep only the rows in which all three converted columns hold a value.
df = df.dropna(subset=["salary_usd", "company_size", "remote_option"])

# --------------------------------------------------
# STEP 4: ALGORITHMIC OUTLIER REMOVAL (IQR)
# --------------------------------------------------
# Rows are kept when salary_usd lies within 1.5 * IQR below the first quartile
# and 1.5 * IQR above the third quartile (bounds included).
salary = df["salary_usd"]

Q1, Q3 = salary.quantile(0.25), salary.quantile(0.75)
IQR = Q3 - Q1

lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Series.between is inclusive on both ends by default.
df = df[salary.between(lower, upper)]

# --------------------------------------------------
# STEP 5: ALGORITHMIC NORMALIZATION
# --------------------------------------------------
# Min-max scale salary_usd into a new column; the original column is kept.
# MinMaxScaler is used with its default settings.
# NOTE(review): the scaler is fitted on every remaining row, i.e. before the
# train/test split in STEP 7, so test rows influence the min/max used for
# scaling - confirm this is intended.
scaler = MinMaxScaler()
df = df.assign(
    # Double brackets pass a one-column DataFrame, as the scaler expects 2-D input.
    salary_usd_normalized=scaler.fit_transform(df[["salary_usd"]])
)

# --------------------------------------------------
# STEP 6: SAVE JOINT DATASET
# --------------------------------------------------
# index=False keeps the pandas row labels out of the file.
df.to_csv(OUTPUT_JOINT, index=False)

# --------------------------------------------------
# STEP 7: TRAIN / TEST SPLIT (80 / 20)
# --------------------------------------------------
# The fixed random_state makes the shuffle, and therefore both files, reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Write each part of the split to its own CSV file.
for split_frame, split_path in ((train_df, OUTPUT_TRAIN), (test_df, OUTPUT_TEST)):
    split_frame.to_csv(split_path, index=False)

# --------------------------------------------------
# STEP 8: ACTIVATION DATA (ONE ROW)
# --------------------------------------------------
# One row drawn from the test split; the seed keeps the choice reproducible.
activation_df = test_df.sample(n=1, random_state=42)
activation_df.to_csv(OUTPUT_ACTIVATION, index=False)

# Final status report listing every file written above.
print("✅ Data scraping and preparation completed successfully")
print("Generated files:")
for generated_file in (OUTPUT_JOINT, OUTPUT_TRAIN, OUTPUT_TEST, OUTPUT_ACTIVATION):
    print("-", generated_file)


✅ Data scraping and preparation completed successfully
Generated files:
- joint_data_collection.csv
- training_data.csv
- test_data.csv
- activation_data.csv
