In [None]:
# Install scikit-learn into the environment of the *running kernel*.
# `%pip` targets the kernel's interpreter, whereas `!pip` runs in a sub-shell whose
# `pip` may belong to a different Python, leaving `import sklearn` failing afterwards.
# TODO: pin an explicit version (scikit-learn==X.Y.Z) once the project version is agreed.
%pip install -q scikit-learn

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os

# Read the raw GitHub trending-repositories CSV into a DataFrame
raw_csv_path = "../data/raw/github_trending_repos.csv"
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) size, then preview the leading rows as the cell output
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
print("\nFirst few rows:")
df.head()

In [None]:
# Columns kept as model inputs
relevant_features = [
    'stars',
    'forks',
    'contributors_count',
    'search_language',
    'timeframe',
    'language',
]

# Columns left out of the feature set (this list includes the target column itself)
dropped_features = [
    'owner',
    'repo_name',
    'description',
    'url',
    'scraped_at',
    'stars_period',
    'full_name',
]

# Column to predict, pulled out of the raw frame as its own Series
target = 'stars_period'
y = df[target]

# Echo the choices made above together with summary statistics of the target
target_stats = y.describe()
print(f"Target variable: {target}")
print(f"Target statistics:\n{target_stats}")
print(f"\nRelevant features: {relevant_features}")

0    1067
1     118
2     226
3     250
4      67
Name: stars_period, dtype: int64

In [None]:
# Independent copy of the selected columns, so the raw frame stays untouched
df_features = df[relevant_features].copy()

# Count nulls per column once and derive the grand total from those counts
missing_per_column = df_features.isnull().sum()
total_missing = missing_per_column.sum()

print("Missing values:")
print(missing_per_column)
print(f"\nTotal missing: {total_missing}")

# When anything is missing, fill every column with its most frequent value
# (first row of the per-column mode table)
if total_missing > 0:
    most_frequent = df_features.mode().iloc[0]
    df_features = df_features.fillna(most_frequent)

df_features.head()


In [None]:
# Derive additional predictors from the three raw count columns

stars_col = df_features['stars']
forks_col = df_features['forks']
contributors_col = df_features['contributors_count']
has_any_contributor = contributors_col > 0

# 1. Engagement metric: stars relative to forks (+1 guards against dividing by zero forks)
df_features['stars_forks_ratio'] = stars_col / (forks_col + 1)

# 2. log(1 + x) versions of the skewed counts; log1p is defined at zero
df_features['log_stars'] = np.log1p(stars_col)
df_features['log_forks'] = np.log1p(forks_col)

# 3. Stars per contributor, forced to 0 wherever there are no contributors
stars_over_contributors = stars_col / contributors_col
df_features['stars_per_contributor'] = np.where(has_any_contributor, stars_over_contributors, 0)

# 4. Popularity score: mean of min-max scaled stars and forks
#    (the 1e-8 keeps the denominator non-zero when a column is constant)
stars_min, stars_max = stars_col.min(), stars_col.max()
forks_min, forks_max = forks_col.min(), forks_col.max()
df_features['normalized_stars'] = (stars_col - stars_min) / (stars_max - stars_min + 1e-8)
df_features['normalized_forks'] = (forks_col - forks_min) / (forks_max - forks_min + 1e-8)
df_features['popularity_score'] = (df_features['normalized_stars'] + df_features['normalized_forks']) / 2

# 5. 0/1 indicator columns: any contributors, and above-median stars / forks
df_features['has_contributors'] = has_any_contributor.astype(int)
df_features['high_stars'] = (stars_col > stars_col.median()).astype(int)
df_features['high_forks'] = (forks_col > forks_col.median()).astype(int)

all_columns = df_features.columns
print("New features created:")
print(all_columns.tolist())
print(f"\nTotal features: {len(all_columns)}")
df_features.head()


In [None]:
# Turn each categorical column into integer codes with its own LabelEncoder.
# (One-hot encoding would be an alternative representation.)

categorical_cols = ['search_language', 'timeframe', 'language']
label_encoders = {}

# Encode on a copy so df_features keeps the original string columns
df_encoded = df_features.copy()

for col in categorical_cols:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df_encoded[col])
    df_encoded[col + '_encoded'] = encoded_values
    # Keep the fitted encoder, keyed by source column, so codes can be mapped back later
    label_encoders[col] = le
    n_classes = len(le.classes_)
    print(f"{col}: {n_classes} unique values")

# Only the *_encoded columns remain; the original string columns are removed
df_encoded = df_encoded.drop(columns=categorical_cols)

encoded_shape = df_encoded.shape
encoded_columns = df_encoded.columns.tolist()
print(f"\nAfter encoding - Shape: {encoded_shape}")
print(f"Features: {encoded_columns}")
df_encoded.head()


In [None]:
# Continuous columns that are candidates for standardization
candidate_numerical = [
    'stars',
    'forks',
    'contributors_count',
    'stars_forks_ratio',
    'log_stars',
    'log_forks',
    'stars_per_contributor',
    'normalized_stars',
    'normalized_forks',
    'popularity_score',
]

# Restrict to the candidates actually present in the encoded frame, preserving order
present_columns = df_encoded.columns
numerical_features = [name for name in candidate_numerical if name in present_columns]

print("Numerical features to normalize:")
print(numerical_features)

# Summary statistics of the selected columns prior to scaling
stats_before = df_encoded[numerical_features].describe()
print("\nFeature statistics before normalization:")
print(stats_before)


In [None]:
# Standardize the numerical features (zero mean, unit variance) with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset here; if a train/test split is
# performed downstream, fit it on the training rows only to avoid leaking test statistics.
scaler = StandardScaler()

# Scale into a copy so df_encoded keeps the unscaled values
df_normalized = df_encoded.copy()

# Fit on the selected columns and overwrite them with their standardized values;
# all other columns (encoded categoricals, binary flags) are left untouched
df_normalized[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

print("Feature statistics after normalization:")
print(df_normalized[numerical_features].describe())
print("\nNormalized features mean (should be ~0):", df_normalized[numerical_features].mean().round(4).tolist())
# StandardScaler divides by the population standard deviation (ddof=0), whereas pandas'
# default .std() is the sample estimate (ddof=1). Use ddof=0 so this sanity check
# measures the same quantity the scaler normalized.
print("Normalized features std (should be ~1):", df_normalized[numerical_features].std(ddof=0).round(4).tolist())


In [None]:
# Correlation of every engineered feature with the target.
# This gives a first indication of which features carry linear signal for prediction.

# Attach the target to a throwaway copy so df_normalized itself stays target-free
df_analysis = df_normalized.copy()
df_analysis['target'] = y

# Signed correlations with the target, largest positive first
correlations = df_analysis.corr()['target'].sort_values(ascending=False)
feature_correlations = correlations.drop('target')
print("Feature correlations with target (stars_period):")
print(feature_correlations)

# Rank by absolute value for the "most correlated" list: a strong negative correlation
# is as informative as a strong positive one, and a signed sort would push it to the bottom.
# The printed values keep their sign.
strongest = feature_correlations.abs().sort_values(ascending=False).head(10).index
print("\nTop 10 features most correlated with target:")
print(feature_correlations.loc[strongest])


In [None]:
# Final feature matrix.
# df_normalized never received the target column (it was attached only to the
# df_analysis copy), so a plain copy is all that is needed here.
df_final = df_normalized.copy()

final_columns = df_final.columns

print("Final feature engineering summary:")
print(f"Original features: {len(relevant_features)}")
print(f"Final features: {len(final_columns)}")
print(f"\nFinal feature list:")
# Numbered listing of every output column, counting from 1
for i, col in enumerate(final_columns, start=1):
    print(f"{i}. {col}")

print(f"\nDataset shape: {df_final.shape}")
df_final.head()


In [None]:
# Write the processed outputs to disk

# Make sure the output directory is there (no error if it already exists)
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Feature matrix and target go to separate files, both without the index column
df_final.to_csv("../data/processed/features.csv", index=False)
y.to_csv("../data/processed/target.csv", index=False)

# A single-file variant: the features plus the target as an extra 'stars_period' column
df_combined = df_final.copy()
df_combined['stars_period'] = y
df_combined.to_csv("../data/processed/processed_dataset.csv", index=False)

# Confirmation listing of everything written above
print("✅ Processed datasets saved successfully!")
print("Files saved:")
print("  - ../data/processed/features.csv")
print("  - ../data/processed/target.csv")
print("  - ../data/processed/processed_dataset.csv")


In [None]:
# Optional recap of the feature-engineering steps performed in this notebook

# Names of the columns derived from the raw counts
new_feature_names = [
    'stars_forks_ratio',
    'log_stars',
    'log_forks',
    'stars_per_contributor',
    'normalized_stars',
    'normalized_forks',
    'popularity_score',
    'has_contributors',
    'high_stars',
    'high_forks',
]

feature_summary = dict(
    original_features=relevant_features,
    new_features_created=new_feature_names,
    categorical_encoded=categorical_cols,
    normalized_features=numerical_features,
    total_features=len(df_final.columns),
    target_variable=target,
)

separator = "=" * 50

print("Feature Engineering Summary:")
print(separator)
for key, value in feature_summary.items():
    # Scalars fit on a single line; lists are expanded into one bullet per entry
    if not isinstance(value, list):
        print(f"{key}: {value}")
        continue
    print(f"\n{key}:")
    for item in value:
        print(f"  - {item}")

print("\n" + separator)
print("Feature engineering complete! ✅")
