In [1]:
import re

import pandas as pd

# Feature-engineering script: reads the cleaned repository CSV, derives 0/1
# keyword and language indicator columns plus calendar features, and writes
# the enriched table to a new CSV.

# Load the dataset. The steps below read its "description", "language" and
# "created_at" columns.
file_path = "github_repos_cleaned.csv"
df = pd.read_csv(file_path)

# 1. Create binary columns for Cloud Platforms & Technologies
# Each key becomes a 0/1 column; each value lists the lowercase substrings
# that switch the column on. Matching is plain substring containment, so
# several entries are already implied by a shorter one in the same group
# (e.g. "amazonaws" by "amazon", "dockers" by "docker").
# NOTE(review): substring matching also fires inside unrelated words
# ("aws" is contained in "laws") -- confirm that this is acceptable.
keyword_groups = {
    "AWS": ["aws", "amazon", "amazonaws", "amazone", "amazons"],
    "Azure": ["azure", "microsoft"],
    "GCP": ["gcp", "google", "googlecloud"],
    "Docker": ["docker", "dockers"],
    "Kubernetes": ["kubernetes", "k8s"],
    "Terraform": ["terraform", "terraforms"],
    "DevOps": ["devops", "cicd", "jenkins"],
}

# Convert description to lowercase to ensure case-insensitive matching.
# The lowercased text replaces the original column, so it is also what ends
# up in the saved output file.
df["description"] = df["description"].astype(str).str.lower()

# Assign 1 if any keyword is found in description, else 0.
# One vectorized search per group replaces a Python-level lambda call per
# row. re.escape keeps every keyword a literal substring, and na=False makes
# a description that is still missing count as "no match" instead of failing.
for column, keywords in keyword_groups.items():
    pattern = "|".join(re.escape(word) for word in keywords)
    matches = df["description"].str.contains(pattern, regex=True, na=False)
    df[column] = matches.astype(int)

# 2. Create binary columns for selected programming languages
top_languages = [
    "JavaScript", "Python", "TypeScript", "Jupyter Notebook", "Java", "C#",
    "Go", "PHP", "C++", "Vue", "Bicep", "Kotlin", "Dart", "Rust", "C", "Ruby"
]

# Assign 1 if language matches, else 0. The comparison is an exact,
# case-sensitive equality test against the "language" column.
for lang in top_languages:
    df[lang] = (df["language"] == lang).astype(int)

# 3. Extract time-based features
# "created_at" is parsed strictly as day-month-year (e.g. 31-12-2023).
df["created_at"] = pd.to_datetime(df["created_at"], format="%d-%m-%Y")
df["year"] = df["created_at"].dt.year
df["month"] = df["created_at"].dt.month
df["day"] = df["created_at"].dt.day
# ISO 8601 week number. Around New Year it can belong to a different ISO year
# than the calendar "year" column above.
df["week"] = df["created_at"].dt.isocalendar().week

# 4. Add 'is_cloud_project' column
# Every row receives the constant 1.
# NOTE(review): presumably the input only contains cloud-related
# repositories -- confirm, since a constant column carries no information.
df["is_cloud_project"] = 1

# Save the updated dataset
df.to_csv("github_repos_feature_engineered.csv", index=False)

print("Feature engineering completed successfully!")


Feature engineering completed successfully!
