Mounted at /content/drive


In [None]:
!pip install pandas scikit-learn flask numpy



In [None]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
# Import the ngrok helper
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify
import pickle
import numpy as np
import pandas as pd

# Initialize the Flask application
app = Flask(__name__)

# Tell ngrok to run with our app
run_with_ngrok(app)

# --- Load the trained model and the label encoder ---
print("Loading model and encoder...")
model = pickle.load(open('career_model.pkl', 'rb'))
label_encoder = pickle.load(open('label_encoder.pkl', 'rb'))
print("Model and encoder loaded successfully.")

# Load training columns to ensure order
train_columns = pd.read_csv('user_skill_career.csv').drop('Career', axis=1).columns.tolist()

@app.route('/predict_career', methods=['POST'])
def predict_career():
    try:
        data = request.json
        input_skills = [data[col] for col in train_columns]
        skills_array = np.array(input_skills).reshape(1, -1)
        prediction_encoded = model.predict(skills_array)
        probabilities = model.predict_proba(skills_array)
        confidence = np.max(probabilities) * 100
        predicted_career = label_encoder.inverse_transform(prediction_encoded)[0]

        if confidence > 90:
            forecast_category = "Strong Match"
        elif 75 < confidence <= 90:
            forecast_category = "Good Match"
        else:
            forecast_category = "Potential Match"

        response = {
            'predicted_career': predicted_career,
            'confidence_score': f"{confidence:.2f}%",
            'forecast_category': forecast_category
        }
        return jsonify(response)
    except Exception as e:
        return jsonify({'error': str(e)}), 400

# This starts the server
app.run()

Loading model and encoder...


FileNotFoundError: [Errno 2] No such file or directory: 'career_model.pkl'

In [None]:
# ==============================================
# ðŸ““ Notebook 2: Pretrained CNN Models (Modified)
# ==============================================

import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50, MobileNetV2, DenseNet121, EfficientNetB0, InceptionV3, VGG16
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import mixed_precision
from sklearn.metrics import classification_report

# âš¡ Enable mixed precision
mixed_precision.set_global_policy("mixed_float16")

# Dataset path
data_dir = "/content/drive/MyDrive/PBL3/data/chest_xray"

# Parameters
img_size = (224, 224)
default_batch_size = 32
epochs = 50

# Data generators
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# -----------------------
# Custom Callback to log results at specific epochs
# -----------------------
class EpochLoggerPretrained(tf.keras.callbacks.Callback):
    def __init__(self, model_name, test_generator, results_df, epochs_to_log=[10,20,30,40,50]):
        super().__init__()
        self.model_name = model_name
        self.test_generator = test_generator
        self.results_df = results_df
        self.epochs_to_log = epochs_to_log

    def on_epoch_end(self, epoch, logs=None):
        epoch_num = epoch + 1
        if epoch_num in self.epochs_to_log:
            preds = self.model.predict(self.test_generator, verbose=0)
            y_pred = (preds > 0.5).astype(int)
            y_true = self.test_generator.classes

            report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

            accuracy = report["accuracy"]
            precision = report["1"]["precision"]
            recall = report["1"]["recall"]
            f1 = report["1"]["f1-score"]

            params = self.model.count_params() / 1e6
            train_time = (time.time() - self.model.start_time) / 60

            self.results_df = pd.concat([self.results_df, pd.DataFrame([{
                "Model": self.model_name,
                "Epoch": epoch_num,
                "Accuracy": accuracy,
                "F1-score": f1,
                "Precision": precision,
                "Recall": recall,
                "Params (M)": params,
                "Train Time (mins)": train_time
            }])], ignore_index=True)

            self.results_df.to_csv("/content/pretrained_results_summary.csv", index=False)
            print(f"âœ… Logged and saved results for {self.model_name} at epoch {epoch_num}")

# -----------------------
# Pretrained models list
# -----------------------
models_to_run = {
    "ResNet50": ResNet50
    # "MobileNetV2": MobileNetV2,
    # "DenseNet121": DenseNet121,
    # "EfficientNetB0": EfficientNetB0,
    # "InceptionV3": InceptionV3,
    # "VGG16": VGG16
}

# Results file
results_file = "/content/pretrained_results_summary.csv"
if os.path.exists(results_file):
    results_df = pd.read_csv(results_file)
else:
    results_df = pd.DataFrame(columns=["Model", "Epoch", "Accuracy", "F1-score", "Precision", "Recall", "Params (M)", "Train Time (mins)"])

# -----------------------
# Training loop
# -----------------------
for model_name, model_fn in models_to_run.items():
    if model_name in results_df["Model"].values:
        print(f"âœ… Skipping {model_name}, already trained.")
        continue

    print(f"\nðŸš€ Training {model_name}...")

    # Adjust batch size for heavy models
    batch_size = 16 if model_name in ["InceptionV3", "VGG16", "DenseNet121"] else default_batch_size

    train_generator = train_datagen.flow_from_directory(
        os.path.join(data_dir, "train"),
        target_size=img_size,
        batch_size=batch_size,
        class_mode="binary"
    )

    val_generator = val_datagen.flow_from_directory(
        os.path.join(data_dir, "val"),
        target_size=img_size,
        batch_size=batch_size,
        class_mode="binary"
    )

    test_generator = test_datagen.flow_from_directory(
        os.path.join(data_dir, "test"),
        target_size=img_size,
        batch_size=1,
        class_mode="binary",
        shuffle=False
    )

    base_model = model_fn(weights="imagenet", include_top=False, input_shape=img_size + (3,))
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.5)(x)
    predictions = Dense(1, activation="sigmoid", dtype="float32")(x)

    model = Model(inputs=base_model.input, outputs=predictions)

    for layer in base_model.layers:
        layer.trainable = False

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    # Track start time
    model.start_time = time.time()

    # Train with callback
    model.fit(train_generator,
              epochs=epochs,
              validation_data=val_generator,
              verbose=1,
              callbacks=[EpochLoggerPretrained(model_name, test_generator, results_df)])

print("\nðŸŽ‰ All models finished!")
print(results_df)



ðŸš€ Training ResNet50...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/PBL3/data/chest_xray/train'

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# === Step 1: Hardcoded dataset with more careers ===
df = pd.DataFrame({
    'career': [
        'Software Engineer', 'Full-Stack Developer', 'Mobile App Developer',
        'Cloud Architect', 'Data Engineer', 'Machine Learning Engineer',
        'Computer Vision Specialist', 'Cybersecurity Consultant',
        'Network Administrator', 'System Administrator', 'IoT Engineer',
        'Automation Engineer', 'Business Analyst', 'Product Manager',
        'Graphic Designer', 'UX/UI Designer', 'Video Producer',
        'Content Creator', 'SEO Specialist', 'Social Media Manager',
        'Financial Controller', 'Operations Manager', 'Risk Analyst',
        'Technical Support Specialist', 'Customer Support Executive'
    ],
    'skills': [
        'Java, C++, Python, OOP, Debugging, DevOps',
        'Node.js, HTML, CSS, React, Express, REST APIs',
        'Swift, Kotlin, Android, iOS, UI/UX',
        'AWS, Azure, GCP, Docker, Kubernetes, Cloud Infrastructure',
        'ETL, Data Warehousing, SQL Server, Hadoop, Spark',
        'Python, TensorFlow, PyTorch, Data Modelling, NLP',
        'OpenCV, Deep Learning, Neural Networks, Image Processing',
        'Information Security, Penetration Testing, IDS, Firewalls',
        'Network Protocols, VPNs, Linux, Router Configuration',
        'Unix, Active Directory, Server Management, Backup',
        'IoT, Embedded Systems, Hardware, Cloud Integration',
        'Robotics, PLC, Automation Systems, Scripting',
        'Business Analysis, Data Collection, Forecasting, SPSS',
        'Product Development, Agile, UI, Market Research',
        'Illustrator, Photoshop, Adobe Creative Suite, Typography',
        'Wireframing, Figma, InVision, User Research, Accessibility',
        'Videography, Editing, Audio, Lighting, Storyboarding',
        'Storytelling, Social Media, SEO, Copywriting',
        'SEO, Google Analytics, Keyword Research',
        'Social Media Marketing, Content Planning, Influencer Marketing',
        'Accounting, Finance Management, Budgeting, Forecasting',
        'Operation Management, Supply Chain, Process Improvement',
        'Risk Management, Financial Modelling, Statistics, Reporting',
        'Problem Solving, Troubleshooting, Documentation',
        'Communication, CRM, Ticketing Tools'
    ]
})

# === Step 2: Vectorize the skills ===
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['skills'])

# === Step 3: Hardcoded user skill input ===
user_input = "Python, TensorFlow, Deep Learning, Data Modelling, NLP, Spark"
user_vec = vectorizer.transform([user_input])

# === Step 4: Calculate similarity ===
similarities = cosine_similarity(user_vec, X).flatten()

# === Step 5: Append match % and rank ===
df['match_percentage'] = (similarities * 100).round(2)
df = df.sort_values(by='match_percentage', ascending=False)

# === Step 6: Display results ===
print(f"Skill input: {user_input}\n")
print("Recommended Careers based on skills:\n")
print(df[['career', 'match_percentage']])


Skill input: Python, TensorFlow, Deep Learning, Data Modelling, NLP, Spark

Recommended Careers based on skills:

                          career  match_percentage
5      Machine Learning Engineer             68.74
6     Computer Vision Specialist             28.13
4                  Data Engineer             24.19
0              Software Engineer             13.36
22                  Risk Analyst             12.65
12              Business Analyst             10.36
2           Mobile App Developer              0.00
3                Cloud Architect              0.00
1           Full-Stack Developer              0.00
8          Network Administrator              0.00
7       Cybersecurity Consultant              0.00
10                  IoT Engineer              0.00
9           System Administrator              0.00
13               Product Manager              0.00
14              Graphic Designer              0.00
15                UX/UI Designer              0.00
11           Automa

In [None]:
import pandas as pd

# Load dataset from CSV file (upload this CSV to Colab or local folder)
df = pd.read_csv('career_skills_dataset.csv')

# Convert the Required_Skills string (comma-separated) into list for each row
df['Required_Skills'] = df['Required_Skills'].apply(lambda x: [skill.strip() for skill in x.split(',')])

def match_careers(user_skills):
    results = []
    user_skills_set = set([skill.strip().lower() for skill in user_skills])

    for idx, row in df.iterrows():
        req_skills = set([skill.lower() for skill in row['Required_Skills']])
        matched_skills = user_skills_set.intersection(req_skills)
        match_percentage = (len(matched_skills) / len(req_skills)) * 100 if req_skills else 0

        result = {
            'Career': row['Career'],
            'Matched_Skills': list(matched_skills),
            'Required_Skills': list(req_skills),
            'Match_Percentage': round(match_percentage, 2)
        }
        results.append(result)

    # Sort by highest match percentage
    results = sorted(results, key=lambda x: x['Match_Percentage'], reverse=True)
    return results

# Example usage
user_input_skills = ['Python', 'Statistics', 'HTML', 'AWS', 'Leadership']
career_matches = match_careers(user_input_skills)

for career in career_matches:
    print(f"Career: {career['Career']}")
    print(f"Matched Skills: {career['Matched_Skills']}")
    print(f"Required Skills: {career['Required_Skills']}")
    print(f"Match Percentage: {career['Match_Percentage']}%\n")


Career: Data Scientist
Matched Skills: ['statistics', 'python']
Required Skills: ['statistics', 'python', 'data analysis', 'machine learning', 'sql']
Match Percentage: 40.0%

Career: Project Manager
Matched Skills: ['leadership']
Required Skills: ['risk management', 'agile', 'leadership', 'communication']
Match Percentage: 25.0%

Career: Product Manager
Matched Skills: ['leadership']
Required Skills: ['agile', 'roadmapping', 'leadership', 'market research']
Match Percentage: 25.0%

Career: Data Analyst
Matched Skills: ['statistics']
Required Skills: ['excel', 'sql', 'statistics', 'visualization']
Match Percentage: 25.0%

Career: Machine Learning Engineer
Matched Skills: ['python']
Required Skills: ['tensorflow', 'data science', 'machine learning', 'python']
Match Percentage: 25.0%

Career: Statistician
Matched Skills: ['statistics']
Required Skills: ['reporting', 'data analysis', 'statistics', 'r']
Match Percentage: 25.0%

Career: Back End Developer
Matched Skills: ['python']
Required 

In [1]:
from flask import Flask, request, jsonify
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
import os

app = Flask(__name__)

# Load dataset
df = pd.read_csv('career_skills_dataset.csv')

# Preprocess skills
df['Required_Skills'] = df['Required_Skills'].apply(lambda x: [skill.strip().lower() for skill in x.split(',')])
df['Required_Skills_Str'] = df['Required_Skills'].apply(lambda x: ' '.join(x))  # For vectorization

# Multi-label binarizer for career labels
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform([[career] for career in df['Career']])  # each career is a label

# Vectorizer + ML model pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# Train the model
model.fit(df['Required_Skills_Str'], Y)

# Save the model and label binarizer (optional)
joblib.dump(model, 'career_model.pkl')
joblib.dump(mlb, 'label_binarizer.pkl')

@app.route('/match-careers', methods=['POST'])
def match_careers():
    data = request.get_json()
    user_skills = data.get('skills', [])

    if not user_skills:
        return jsonify({'error': 'No skills provided'}), 400

    # Preprocess user input
    user_input = ' '.join([skill.strip().lower() for skill in user_skills])

    # Predict using the trained model
    preds = model.predict([user_input])
    matched_careers = mlb.inverse_transform(preds)

    return jsonify({
        'input_skills': user_skills,
        'matched_careers': matched_careers[0] if matched_careers else []
    })

if __name__ == '__main__':
    app.run(debug=True, port=8000, host='0.0.0.0')


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8000
 * Running on http://172.28.0.12:8000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with watchdog (inotify)
