In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
import os
import matplotlib.pyplot as plt
import matplotlib
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Base path for project.
# The data directory can be overridden with the JOB_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original absolute path remains the default.
base_path = os.environ.get(
    "JOB_DATA_DIR",
    r"/Users/ilukerogers/Desktop/computerScience/MATH354/project/",
)
if not os.path.isdir(base_path):
    # Fail once, up front, with a message that says how to fix it.
    raise FileNotFoundError(
        f"Data directory not found: {base_path!r}. "
        "Set the JOB_DATA_DIR environment variable to the folder that contains the CSV files."
    )


def _load_table(filename):
    """Read one CSV file from the project data directory into a DataFrame."""
    return pd.read_csv(os.path.join(base_path, filename))


# Load all datasets
postings = _load_table("postings.csv")
benefits = _load_table("benefits.csv")
job_industries = _load_table("job_industries.csv")
job_skills = _load_table("job_skills.csv")
companies = _load_table("companies.csv")
employee_counts = _load_table("employee_counts.csv")
company_industries = _load_table("company_industries.csv")
company_specialities = _load_table("company_specialities.csv")
industries = _load_table("industries.csv")
salaries = _load_table("salaries.csv")
skills = _load_table("skills.csv")

# Count the distinct join keys in each table: job_id for the job-level tables,
# company_id for the company-level tables.
unique_values = {
    table_name: frame[key_column].nunique()
    for table_name, frame, key_column in (
        ('postings', postings, 'job_id'),
        ('benefits', benefits, 'job_id'),
        ('job_industries', job_industries, 'job_id'),
        ('job_skills', job_skills, 'job_id'),
        ('companies', companies, 'company_id'),
        ('employee_counts', employee_counts, 'company_id'),
        ('company_industries', company_industries, 'company_id'),
        ('company_specialities', company_specialities, 'company_id'),
    )
}

print(unique_values)

# Job-level table: every posting, with benefit columns attached where a match exists.
# NOTE(review): a left join repeats a posting once per matching benefits row, so this is
# only one-to-one if job_id is unique in benefits - confirm against the counts printed above.
merged_jobs = postings.merge(benefits, how='left', on='job_id')

# Company-level table: every company, with employee-count columns attached.
# NOTE(review): same caveat - assumes one employee_counts row per company_id; verify.
merged_companies = companies.merge(employee_counts, how='left', on='company_id')

# Combine the job-level and company-level tables on company_id.
comprehensive_data_one_to_one = merged_jobs.merge(
    merged_companies, how='left', on='company_id'
)
print(comprehensive_data_one_to_one.head())

def _missing_counts(frame):
    """Return the null count of every column of `frame` that has nulls, largest first."""
    null_counts = frame.isnull().sum()
    return null_counts[null_counts > 0].sort_values(ascending=False)


def _fill_existing_columns(frame, columns, value):
    """Replace NaNs with `value` in each of `columns` that exists in `frame` (modifies `frame`).

    The result is assigned back with ``frame[col] = frame[col].fillna(value)``.
    Calling ``frame[col].fillna(value, inplace=True)`` is chained assignment: recent
    pandas versions warn about it and, with Copy-on-Write enabled, it no longer
    updates the parent DataFrame.
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = frame[col].fillna(value)


# Identify missing data
missing_data = comprehensive_data_one_to_one.isnull().sum()

# Display columns with missing data
significant_missing_columns = missing_data[missing_data > 0].sort_values(ascending=False)

print(significant_missing_columns)

# Handling missing values

# Use "Not Specified" for categorical columns with missing values
cols_fill_not_specified = [
    'skills_desc', 'type', 'pay_period', 'currency', 'compensation_type', 'posting_domain',
    'application_url', 'formatted_experience_level', 'company_size', 'zip_code', 'address',
    'state', 'url', 'city', 'country', 'name'
]
_fill_existing_columns(comprehensive_data_one_to_one, cols_fill_not_specified, "Not Specified")

# Fill numerical columns with zeros
cols_fill_zero = ['applies', 'views', 'follower_count', 'employee_count']
_fill_existing_columns(comprehensive_data_one_to_one, cols_fill_zero, 0)

# Fill 'remote_allowed' with "Unknown"
_fill_existing_columns(comprehensive_data_one_to_one, ['remote_allowed'], "Unknown")

# For salary and other complex columns, leave NaNs for specific handling later

# Check remaining missing values
remaining_missing = comprehensive_data_one_to_one.isnull().sum()
remaining_missing_cols = _missing_counts(comprehensive_data_one_to_one)

print(remaining_missing_cols)

# Handle even more missing values

# Use "Still Open" for closed_time
_fill_existing_columns(comprehensive_data_one_to_one, ['closed_time'], "Still Open")

# Use "Unknown" for inferred
_fill_existing_columns(comprehensive_data_one_to_one, ['inferred'], "Unknown")

# Use "Not Specified" for company description
_fill_existing_columns(comprehensive_data_one_to_one, ['description_y'], "Not Specified")

# Check for remaining missing values
remaining_missing = comprehensive_data_one_to_one.isnull().sum()
remaining_missing_cols = _missing_counts(comprehensive_data_one_to_one)

print(remaining_missing_cols)

# Fill missing job descriptions (this column is required, so it is accessed without a guard)
comprehensive_data_one_to_one['description_x'] = (
    comprehensive_data_one_to_one['description_x'].fillna("Not Specified")
)

# Remove duplicate rows
comprehensive_data_cleaned = comprehensive_data_one_to_one.drop_duplicates()

# Shape of the cleaned data
print(comprehensive_data_cleaned.shape)

# Most in-demand skills across job postings (computed once; the original repeated this
# identical statement three times)
top_skills = job_skills['skill_abr'].value_counts().head(10)

# Set Matplotlib backend to 'TkAgg' for proper plot rendering in PyCharm.
# matplotlib.use raises ImportError when the requested backend cannot be set up
# (for example when Tk is not installed), so keep the current backend in that case
# instead of aborting the whole run.
try:
    matplotlib.use('TkAgg')
except ImportError as backend_error:
    print(f"TkAgg backend unavailable ({backend_error}); keeping the current backend")

# Plotting
plt.figure(figsize=(12, 7))
top_skills.plot(kind='bar', color='lightcoral')
plt.title('Top 10 Most In-Demand Skills Across Job Postings')
plt.xlabel('Skill Abbreviation')
plt.ylabel('Number of Mentions in Job Postings')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
import yfinance as yf
import warnings
from datetime import datetime

# Suppress warnings for cleaner output
# NOTE(review): with no category argument this ignores every warning category from this
# point on, including deprecation warnings from pandas/scikit-learn; consider narrowing
# it while debugging.
warnings.filterwarnings('ignore')


# Load the pre-processed job market data
# Assuming comprehensive_data_cleaned is already available from your data processing

# Enhanced Feature Engineering with more detailed output
def prepare_stock_features(job_data, skills_data, company_industries_data):
    """
    Build one row of aggregated job-market features per company, with progress output.

    Parameters
    ----------
    job_data : DataFrame
        Must contain 'company_id', 'applies', 'views', 'remote_allowed',
        'employee_count' and 'follower_count'. A 'job_id' column is only needed
        when `skills_data` is keyed by job rather than by company.
    skills_data : DataFrame
        Must contain 'skill_abr' and either 'company_id' or 'job_id'. When only
        'job_id' is present, skills are linked to companies through the
        job_id -> company_id pairs found in `job_data`. If neither link is
        possible the skills feature is skipped with a warning.
    company_industries_data : DataFrame
        Industry counts are added only if it has an 'industry' column.

    Returns
    -------
    DataFrame
        Indexed by 'company_id', one row per company, NaNs replaced by 0.
    """
    print("\n=== Starting Feature Engineering ===")

    # 1. Company-level features
    print("Calculating company-level features...")
    company_features = job_data.groupby('company_id').agg({
        'applies': ['sum', 'mean', 'std'],
        'views': ['sum', 'mean', 'std'],
        # NOTE(review): counts rows equal to the string 't'; confirm that this is how
        # remote postings are encoded in the input data.
        'remote_allowed': lambda x: (x == 't').mean(),
        'employee_count': ['mean', 'max'],
        'follower_count': ['mean', 'max']
    })
    # Flatten the (column, aggregation) pairs into single names such as 'applies_sum'
    company_features.columns = ['_'.join(col).strip() for col in company_features.columns.values]
    print(f"Created {len(company_features.columns)} company-level features")

    # Merge on an ordinary column: merging an index level against a column would return
    # a frame with a fresh integer index, losing company_id as the index.
    company_features = company_features.reset_index()

    # 2. Add industry information
    print("\nAdding industry information...")
    if 'industry' in company_industries_data.columns:
        industry_counts = (
            company_industries_data.groupby('company_id')['industry']
            .nunique()
            .rename('industry_count')
            .reset_index()
        )
        company_features = company_features.merge(industry_counts, on='company_id', how='left')
        print("Added industry count features")
    else:
        print("Warning: Industry data not available - skipping industry features")

    # 3. Add skills demand
    print("\nAdding skills information...")
    if 'company_id' in skills_data.columns:
        skills_by_company = skills_data
    elif 'job_id' in skills_data.columns and 'job_id' in job_data.columns:
        # Skills are recorded per job posting: attach each posting's company first
        job_to_company = job_data[['job_id', 'company_id']].drop_duplicates()
        skills_by_company = skills_data.merge(job_to_company, on='job_id', how='inner')
    else:
        skills_by_company = None

    if skills_by_company is not None:
        skill_counts = (
            skills_by_company.groupby('company_id')['skill_abr']
            .nunique()
            .rename('unique_skills_count')
            .reset_index()
        )
        company_features = company_features.merge(skill_counts, on='company_id', how='left')
        print("Added skills count features")
    else:
        print("Warning: Skills data cannot be linked to companies - skipping skills features")

    # Restore company_id as the index so callers can read the ids from the index
    company_features = company_features.set_index('company_id')

    # Fill any remaining NA values
    print("\nHandling missing values...")
    initial_missing = company_features.isna().sum().sum()
    company_features = company_features.fillna(0)
    final_missing = company_features.isna().sum().sum()
    print(f"Filled {initial_missing} missing values (remaining: {final_missing})")

    print("\n=== Feature Engineering Complete ===")
    print(f"Final feature matrix shape: {company_features.shape}")
    print("Feature columns:", list(company_features.columns))

    return company_features


# Enhanced stock data fetching with more detailed output
def get_stock_data(company_ids, companies_df, lookahead_days=30, max_companies=20):
    """
    Fetch recent price history and forward returns for companies with a known ticker.

    Parameters
    ----------
    company_ids : sequence
        Company ids to try, in order. Must support len() and slicing.
    companies_df : DataFrame
        Must contain 'company_id' and 'name'; the name is used to look up a ticker.
    lookahead_days : int, default 30
        Number of rows ahead used to compute 'future_return'. Must be >= 0.
    max_companies : int or None, default 20
        Only the first `max_companies` ids are tried (None tries all of them).

    Returns
    -------
    DataFrame
        Concatenated per-company rows with 'future_close', 'future_return',
        'company_id', 'company_name' and 'ticker' columns. When nothing could be
        fetched, a randomly generated single-company sample (ticker 'SAMPLE') is
        returned instead so that downstream code can still run.

    Raises
    ------
    ValueError
        If `lookahead_days` is negative.
    """
    if lookahead_days < 0:
        raise ValueError("lookahead_days must be non-negative")

    print("\n=== Fetching Stock Data ===")

    # Sample ticker mapping (replace with your actual mapping)
    sample_tickers = {
        'amazon': 'AMZN',
        'microsoft': 'MSFT',
        'google': 'GOOGL',
        'apple': 'AAPL',
        'facebook': 'META',
        'tesla': 'TSLA',
        'netflix': 'NFLX',
        'oracle': 'ORCL',
        'ibm': 'IBM',
        'intel': 'INTC'
    }

    stock_data = []
    success_count = 0
    fail_count = 0

    print(f"Attempting to fetch data for {len(company_ids)} companies...")

    candidate_ids = company_ids if max_companies is None else company_ids[:max_companies]
    for company_id in candidate_ids:
        # Get company name
        company_info = companies_df[companies_df['company_id'] == company_id]
        if len(company_info) == 0:
            fail_count += 1
            continue

        company_name = company_info['name'].values[0]

        # Find matching ticker: the first word of the name that is a known key
        ticker = None
        for name_part in str(company_name).lower().split():
            if name_part in sample_tickers:
                ticker = sample_tickers[name_part]
                break

        if not ticker:
            fail_count += 1
            continue

        try:
            # Get historical data
            print(f"\nFetching data for {company_name} ({ticker})...")
            stock = yf.Ticker(ticker)
            hist = stock.history(period="1y")

            if len(hist) < 30:
                print(f"Insufficient data for {ticker} - skipping")
                fail_count += 1
                continue

            # Calculate future returns
            hist['future_close'] = hist['Close'].shift(-lookahead_days)
            hist['future_return'] = (hist['future_close'] - hist['Close']) / hist['Close']

            # Add company info
            hist['company_id'] = company_id
            hist['company_name'] = company_name
            hist['ticker'] = ticker

            # Keep the last 60 rows whose future return is known. Dropping NaNs (instead
            # of slicing with -lookahead_days) also works for lookahead_days == 0, where
            # a slice ending at -0 would be empty.
            recent_data = hist.iloc[-60:].dropna(subset=['future_return'])
            if recent_data.empty:
                print(f"No rows with a known future return for {ticker} - skipping")
                fail_count += 1
                continue

            stock_data.append(recent_data)
            success_count += 1

            print(f"Successfully retrieved {len(recent_data)} days of data")

        except Exception as e:
            # Best effort: a failed download for one company must not stop the others
            print(f"Failed to get data for {company_name}: {str(e)}")
            fail_count += 1
            continue

    print("\n=== Stock Data Fetching Complete ===")
    print(f"Successfully retrieved data for {success_count} companies")
    print(f"Failed to retrieve data for {fail_count} companies")

    if not stock_data:
        print("\nWarning: No stock data retrieved. Using sample data for demonstration.")
        sample_company = companies_df.iloc[0]['company_id']
        sample_name = companies_df.iloc[0]['name']
        # 60 rows by default; longer when needed so that at least 30 rows still have a
        # known future return for the requested lookahead.
        n_periods = max(60, lookahead_days + 30)
        dates = pd.date_range(end=datetime.today(), periods=n_periods)
        sample_df = pd.DataFrame({
            'Date': dates,
            'Close': np.random.normal(100, 10, n_periods),
            'company_id': sample_company,
            'company_name': sample_name,
            'ticker': 'SAMPLE'
        })
        # Use the caller's lookahead here too (the original hard-coded 30)
        sample_df['future_close'] = sample_df['Close'].shift(-lookahead_days)
        sample_df['future_return'] = (sample_df['future_close'] - sample_df['Close']) / sample_df['Close']
        sample_df.set_index('Date', inplace=True)
        stock_data.append(sample_df.dropna())

    return pd.concat(stock_data)


# Enhanced Model Training with detailed output
def train_and_evaluate_model(X, y):
    """
    Fit a scaled random-forest regressor on an 80/20 split and report its metrics.

    Returns the fitted pipeline together with a dict holding the train/test
    MSE, MAE and R2 scores.
    """
    print("\n=== Model Training ===")

    # Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    for split_label, split_X in (("Train", X_train), ("Test", X_test)):
        print(f"{split_label} set size: {split_X.shape[0]} samples")

    # Scaling followed by the forest, wrapped in one pipeline
    forest = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1)
    model = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', forest)])

    print("\nTraining model...")
    model.fit(X_train, y_train)
    print("Training complete")

    print("\n=== Model Evaluation ===")
    # (actual, predicted) per split; predictions are made for train first, then test
    predictions = {
        "Train": (y_train, model.predict(X_train)),
        "Test": (y_test, model.predict(X_test)),
    }

    scorers = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    )
    metrics = {}
    for metric_label, scorer in scorers:
        for split_label, (actual, predicted) in predictions.items():
            metrics[f"{split_label} {metric_label}"] = scorer(actual, predicted)

    print("\nModel Performance Metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    return model, metrics


# Main execution flow with enhanced output
print("=== Starting Stock Market Predictor ===")

# Prepare features
try:
    company_features = prepare_stock_features(
        comprehensive_data_cleaned,
        job_skills,
        company_industries
    )
except Exception as e:
    # Deliberate best effort: report the problem and continue with a reduced feature set
    print(f"\nError in feature preparation: {str(e)}")
    print("Using fallback feature engineering...")
    company_features = comprehensive_data_cleaned.groupby('company_id').agg({
        'applies': ['sum', 'mean'],
        'views': ['sum', 'mean'],
        'remote_allowed': lambda x: (x == 't').mean(),
        'employee_count': 'mean',
        'follower_count': 'mean'
    })
    company_features.columns = ['_'.join(col).strip() for col in company_features.columns.values]
    company_features = company_features.fillna(0)

# The feature table may carry company_id either as its index (fallback path) or as a
# column; normalise to the index so the ids below are real company ids rather than
# row numbers.
if 'company_id' in company_features.columns:
    company_features = company_features.set_index('company_id')

# Get stock data
company_ids_with_features = company_features.index.unique()
stock_data = get_stock_data(company_ids_with_features, companies)

# Merge features with stock data
merged_data = stock_data.reset_index().merge(
    company_features,
    on='company_id',
    how='left'
)

# Prepare features and target
feature_cols = [col for col in company_features.columns if col != 'company_id']
# Stock rows whose company has no feature row (e.g. the sample-data company) come out
# of the left join with NaN features; use 0, the same default the feature table uses.
merged_data[feature_cols] = merged_data[feature_cols].fillna(0)
X = merged_data[feature_cols]
y = merged_data['future_return']

print("\n=== Final Data Summary ===")
print(f"Total samples available: {len(X)}")
print(f"Number of features: {len(feature_cols)}")
print(f"Target variable stats: mean={y.mean():.4f}, std={y.std():.4f}")

# Train and evaluate model
model, metrics = train_and_evaluate_model(X, y)

# Feature importance analysis
print("\n=== Feature Importance Analysis ===")

# Get feature importances from the fitted random forest inside the pipeline
importances = model.named_steps['regressor'].feature_importances_
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# Plot feature importance (at most 20 bars)
top_features = feature_importance.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(
    x='Importance',
    y='Feature',
    data=top_features,
    palette='viridis'
)
# The title reports the number of bars actually drawn; it used to say "Top 7"
# regardless of how many features were plotted.
plt.title(f'Top {len(top_features)} Most Important Features for Stock Return Prediction')
plt.tight_layout()
plt.show()

# Detailed predictions for top companies
print("\n=== Detailed Predictions ===")

# Get top 5 companies with most data points
top_companies = merged_data['company_name'].value_counts().head(5).index

for company in top_companies:
    company_data = merged_data[merged_data['company_name'] == company]
    if len(company_data) == 0:
        continue

    # Get latest data point (last row for this company)
    latest = company_data.iloc[-1]
    # Predict from a one-row DataFrame. Taking the values from the row Series instead
    # gives an object-dtype array (the row mixes strings and numbers) and drops the
    # column names the model was fitted with.
    features = company_data[feature_cols].iloc[[-1]]
    prediction = model.predict(features)[0]
    actual = latest['future_return']
    ticker = latest.get('ticker', 'UNKNOWN')

    print(f"\nCompany: {company} ({ticker})")
    print(f"Predicted 30-day return: {prediction * 100:.2f}%")
    if not pd.isna(actual):
        print(f"Actual 30-day return: {actual * 100:.2f}%")
        print(f"Prediction error: {(prediction - actual) * 100:.2f}%")
    print("\nKey Features:")
    for feat in feature_importance['Feature'].head(3):
        print(f"{feat}: {latest[feat]:.2f}")

print("\n=== Stock Market Predictor Complete ===")


# Create classification target: Low, Medium, High return classes.
# pd.qcut with q=3 cuts future_return at its terciles, so the three classes hold roughly
# equal numbers of rows; rows with a missing future_return get a missing class.
merged_data['return_class'] = pd.qcut(
    merged_data['future_return'], q=3, labels=['Low', 'Medium', 'High']
)

# Alternative: replace the qcut call above with the fixed-threshold binning below to get
# the same classes as in the report. The confusion-matrix code further down hard-codes
# the labels 'Low', 'Medium', 'High' and would need the new labels as well.

#bins = [-float('inf'), -0.01, 0.01, float('inf')]
#labels = ['-1', '0', '1']  # or ['Decrease', 'Neutral', 'Increase'] if preferred

# Create classification target using custom thresholds
#merged_data['return_class'] = pd.cut(merged_data['future_return'], bins=bins, labels=labels)





from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

print("\n=== Classification Model Training ===")

# Prepare input features and target
X_class = merged_data[feature_cols]
y_class = merged_data['return_class']

# Class labels in their natural order. They are read from the categorical target so that
# the report, the printed matrix and the heatmap all use the same order (the printed
# matrix previously used alphabetical order while the heatmap used Low/Medium/High).
if isinstance(y_class.dtype, pd.CategoricalDtype):
    class_order = list(y_class.cat.categories)
else:
    class_order = ['Low', 'Medium', 'High']

# Drop NA targets if any
has_target = y_class.notna()
X_class = X_class[has_target]
y_class = y_class[has_target]

# Print class distribution before SMOTE
print("\nClass distribution before SMOTE:")
print(y_class.value_counts())

# Split data (stratified so each class keeps its share in both parts)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)

rf_base = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced'
)

# Scaling -> SMOTE oversampling (applied by the pipeline during fit only) -> calibrated forest
clf_pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', CalibratedClassifierCV(rf_base, method='sigmoid', cv=5))
])

# Train model
print("\nTraining model with SMOTE...")
clf_pipeline.fit(X_train_c, y_train_c)

# Predict
y_pred_c = clf_pipeline.predict(X_test_c)

# Evaluate
print("\nClassification Report:")
print(classification_report(y_test_c, y_pred_c, labels=class_order))

# One matrix, computed once with an explicit label order, is used for both the text
# output and the heatmap. Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test_c, y_pred_c, labels=class_order)
print("Confusion Matrix:")
print(pd.DataFrame(cm, index=class_order, columns=class_order))

print("\nCatagory boundries:")
bins = pd.qcut(merged_data['future_return'], q=3, duplicates='drop')
print(bins.unique())

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_order, yticklabels=class_order)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()



print("\n=== Stock Market Only Classification Model ===")

# Prepare stock market only features
# First, let's print all available features to debug
print("\nAvailable features:")
print(feature_cols)

# Create stock market only features.
# The price/volume columns are matched by exact name in merged_data. The previous
# substring search over feature_cols could never find them (feature_cols only holds the
# job-market features) and instead matched names such as 'follower_count_mean', which
# contains 'low'. Exact matching also keeps 'future_close' (the target's source) out.
ohlcv_names = {'open', 'high', 'low', 'close', 'volume'}
stock_market_features = [col for col in merged_data.columns if str(col).lower() in ohlcv_names]

if not stock_market_features:
    print("\nNo direct stock market features found. Using alternative approach...")
    # Use only the first 5 features as a proxy for stock market data
    stock_market_features = feature_cols[:5]
    print("Using first 5 features as proxy:", stock_market_features)

print("\nSelected stock market features:")
print(stock_market_features)

# Create stock market only dataset
X_stock = merged_data[stock_market_features].copy()
y_stock = merged_data['return_class'].copy()

# Class labels in their natural order, shared by the report, the printed matrix and the heatmap
if isinstance(y_stock.dtype, pd.CategoricalDtype):
    stock_class_order = list(y_stock.cat.categories)
else:
    stock_class_order = ['Low', 'Medium', 'High']

# Print shapes to verify
print("\nData shapes:")
print(f"X_stock shape: {X_stock.shape}")
print(f"y_stock shape: {y_stock.shape}")

# Drop NA targets if any
stock_has_target = y_stock.notna()
X_stock = X_stock[stock_has_target]
y_stock = y_stock[stock_has_target]

print("\nAfter dropping NA:")
print(f"X_stock shape: {X_stock.shape}")
print(f"y_stock shape: {y_stock.shape}")

# Print class distribution before SMOTE
print("\nClass distribution before SMOTE (Stock Market Only):")
print(y_stock.value_counts())

# Split data
X_train_stock, X_test_stock, y_train_stock, y_test_stock = train_test_split(
    X_stock, y_stock, test_size=0.2, random_state=42, stratify=y_stock
)

print("\nTrain/test split shapes:")
print(f"X_train_stock: {X_train_stock.shape}")
print(f"X_test_stock: {X_test_stock.shape}")

# Create and train stock market only model with SMOTE
rf_stock = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced'
)

stock_pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', CalibratedClassifierCV(rf_stock, method='sigmoid', cv=5))
])

# Train model
print("\nTraining stock market only model with SMOTE...")
stock_pipeline.fit(X_train_stock, y_train_stock)

# Predict
y_pred_stock = stock_pipeline.predict(X_test_stock)

# Evaluate
print("\nStock Market Only Model Classification Report:")
print(classification_report(y_test_stock, y_pred_stock, labels=stock_class_order))

# Rows are actual classes, columns are predicted classes, both in stock_class_order
cm_stock = confusion_matrix(y_test_stock, y_pred_stock, labels=stock_class_order)
print("\nStock Market Only Model Confusion Matrix:")
print(pd.DataFrame(cm_stock, index=stock_class_order, columns=stock_class_order))

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm_stock, annot=True, fmt="d", cmap="Blues",
            xticklabels=stock_class_order,
            yticklabels=stock_class_order)
plt.title("Stock Market Only Model Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# Compare feature importance
# CalibratedClassifierCV does not fit the estimator it was constructed with; it fits one
# clone per cross-validation fold and keeps them in calibrated_classifiers_. The
# importances are therefore averaged over those fitted clones.
calibrated_clf = stock_pipeline.named_steps['classifier']
fold_importances = []
for fitted_fold in calibrated_clf.calibrated_classifiers_:
    # The fitted model is exposed as `estimator`; older scikit-learn releases
    # called the same attribute `base_estimator`.
    if hasattr(fitted_fold, 'estimator'):
        fold_model = fitted_fold.estimator
    else:
        fold_model = fitted_fold.base_estimator
    fold_importances.append(fold_model.feature_importances_)

feature_importance_stock = pd.DataFrame({
    'Feature': stock_market_features,
    'Importance': np.mean(fold_importances, axis=0)
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Stock Market Features:")
print(feature_importance_stock.head(10))

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance_stock.head(10),
    palette='viridis'
)
plt.title('Top 10 Most Important Stock Market Features')
plt.tight_layout()
plt.show()

# Side-by-side comparison of the two classifiers on their respective test sets
print("\n=== Model Comparison ===")
print("Full Model (All Features) vs Stock Market Only Model")

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Full model: accuracy plus support-weighted precision / recall / F1
full_accuracy = accuracy_score(y_test_c, y_pred_c)
full_precision, full_recall, full_f1 = precision_recall_fscore_support(
    y_test_c, y_pred_c, average='weighted'
)[:3]

# Stock market only model: the same four scores
stock_accuracy = accuracy_score(y_test_stock, y_pred_stock)
stock_precision, stock_recall, stock_f1 = precision_recall_fscore_support(
    y_test_stock, y_pred_stock, average='weighted'
)[:3]

metric_names = ("Accuracy", "Precision", "Recall", "F1 Score")
full_scores = (full_accuracy, full_precision, full_recall, full_f1)
stock_scores = (stock_accuracy, stock_precision, stock_recall, stock_f1)

print("\nFull Model Metrics:")
for metric_name, score in zip(metric_names, full_scores):
    print(f"{metric_name}: {score:.4f}")

print("\nStock Market Only Model Metrics:")
for metric_name, score in zip(metric_names, stock_scores):
    print(f"{metric_name}: {score:.4f}")

# Positive numbers mean the full model scored higher
print("\nImprovement from Stock Market Only to Full Model:")
for metric_name, full_score, stock_score in zip(metric_names, full_scores, stock_scores):
    print(f"{metric_name}: {full_score - stock_score:.4f}")

# Show how many features each model used
print("\nFeature Set Comparison:")
print("\nFull Model Features:", len(feature_cols))
print("Stock Market Only Features:", len(stock_market_features))
print("\nStock Market Features:", stock_market_features)




FileNotFoundError: [Errno 2] No such file or directory: '/Users/ilukerogers/Desktop/computerScience/MATH354/project/postings.csv'

In [5]:
# Report the installed versions from package metadata first. This works even when the
# packages cannot be imported, which is exactly the situation this check is meant to
# diagnose (e.g. an imbalanced-learn release that does not match scikit-learn).
from importlib.metadata import PackageNotFoundError, version

for dist_name in ("scikit-learn", "imbalanced-learn"):
    try:
        print(f"{dist_name}:", version(dist_name))
    except PackageNotFoundError:
        print(f"{dist_name}:", "not installed")

# Import afterwards so that an incompatibility still surfaces as an error, but only
# after the version numbers needed to resolve it have been printed.
import sklearn, imblearn

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (/Users/benjaminscott/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py)