In [2]:
import sqlite3

db_file = "snippets.db"

try:
    # 1. Connect to the SQLite database
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    print("✅ Successfully connected to the database.")

    # 2. Get the name of the first table
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_name_tuple = cursor.fetchone()

    if table_name_tuple:
        table_name = table_name_tuple[0]
        print(f"\nFound table: '{table_name}'")

        # 3. Fetch only the first 5 rows from that table
        print("Fetching a small sample of 5 rows...")
        cursor.execute(f"SELECT * FROM {table_name} LIMIT 5;")
        
        rows = cursor.fetchall()
        
        # 4. Print the sample rows
        print("\nSample data extracted successfully:")
        for row in rows:
            print(row)
    else:
        print("\nNo tables found in the database.")

except Exception as e:
    print(f"❌ An error occurred: {e}")

finally:
    # 5. Close the connection
    if 'conn' in locals():
        conn.close()
        print("\nDatabase connection closed.")

✅ Successfully connected to the database.

Found table: 'snippets'
Fetching a small sample of 5 rows...

Sample data extracted successfully:
(478, 'dist/\nyarn.lock\nnpm-debug.log\nnode_modules/\nsftp-config.json\n', 'DOTFILE', 'NodeBB/NodeBB/.gitignore', 'https://github.com/NodeBB/NodeBB', 'GPL-3.0', '21634e2681fb1329bcbab7b2e19418ebdb1012e1\n', 0, 5)
(479, 'config.json\njsconfig.json\npublic/src/nodebb.min.js\n!src/views/config.json\npublic/css/*.css\n', 'DOTFILE', 'NodeBB/NodeBB/.gitignore', 'https://github.com/NodeBB/NodeBB', 'GPL-3.0', '21634e2681fb1329bcbab7b2e19418ebdb1012e1\n', 5, 5)
(480, '*.sublime-project\n*.sublime-workspace\n.project\n*.swp\nVagrantfile\n', 'DOTFILE', 'NodeBB/NodeBB/.gitignore', 'https://github.com/NodeBB/NodeBB', 'GPL-3.0', '21634e2681fb1329bcbab7b2e19418ebdb1012e1\n', 10, 5)
(481, '.vagrant\nprovision.sh\n*.komodoproject\n.DS_Store\nfeeds/recent.rss\n', 'DOTFILE', 'NodeBB/NodeBB/.gitignore', 'https://github.com/NodeBB/NodeBB', 'GPL-3.0', '21634e2681fb132

In [3]:
import pandas as pd

# Load your new, smaller sample file
df = pd.read_csv("github_snippets_sample.csv")

# Display the first 5 rows to verify it loaded correctly
df.head()

Unnamed: 0,id,snippet,language,repo_file_name,github_repo_url,license,commit_hash,starting_line_number,chunk_size
0,478,dist/\nyarn.lock\nnpm-debug.log\nnode_modules/...,DOTFILE,NodeBB/NodeBB/.gitignore,https://github.com/NodeBB/NodeBB,GPL-3.0,21634e2681fb1329bcbab7b2e19418ebdb1012e1\n,0,5
1,479,config.json\njsconfig.json\npublic/src/nodebb....,DOTFILE,NodeBB/NodeBB/.gitignore,https://github.com/NodeBB/NodeBB,GPL-3.0,21634e2681fb1329bcbab7b2e19418ebdb1012e1\n,5,5
2,480,*.sublime-project\n*.sublime-workspace\n.proje...,DOTFILE,NodeBB/NodeBB/.gitignore,https://github.com/NodeBB/NodeBB,GPL-3.0,21634e2681fb1329bcbab7b2e19418ebdb1012e1\n,10,5
3,481,.vagrant\nprovision.sh\n*.komodoproject\n.DS_S...,DOTFILE,NodeBB/NodeBB/.gitignore,https://github.com/NodeBB/NodeBB,GPL-3.0,21634e2681fb1329bcbab7b2e19418ebdb1012e1\n,15,5
4,482,.eslintcache\n.svn\n\nlogs/\n\n,DOTFILE,NodeBB/NodeBB/.gitignore,https://github.com/NodeBB/NodeBB,GPL-3.0,21634e2681fb1329bcbab7b2e19418ebdb1012e1\n,20,5


In [4]:
# Select only the 'snippet' and 'language' columns
df_clean = df[['snippet', 'language']].copy()

# Drop any rows where the snippet or language is missing
df_clean.dropna(inplace=True)

# Check the language distribution to see what we're working with
print("Language counts in the cleaned dataset:")
print(df_clean['language'].value_counts())

# Display the first 5 rows of the new clean DataFrame
df_clean.head()

Language counts in the cleaned dataset:
language
JSON          22012
JavaScript    19898
UNKNOWN        4036
YAML           2396
HTML           1542
DOTFILE          99
Markdown         17
Name: count, dtype: int64


Unnamed: 0,snippet,language
0,dist/\nyarn.lock\nnpm-debug.log\nnode_modules/...,DOTFILE
1,config.json\njsconfig.json\npublic/src/nodebb....,DOTFILE
2,*.sublime-project\n*.sublime-workspace\n.proje...,DOTFILE
3,.vagrant\nprovision.sh\n*.komodoproject\n.DS_S...,DOTFILE
4,.eslintcache\n.svn\n\nlogs/\n\n,DOTFILE


In [5]:
# 1. Define the specific programming languages you want to train on.
#    Let's select a few that have a good number of samples.
target_languages = ['JavaScript', 'HTML'] # Add others like 'Python', 'Java' if they exist

# 2. Filter the DataFrame to only include rows with these languages.
df_filtered = df_clean[df_clean['language'].isin(target_languages)]

# 3. Balance the dataset by taking an equal, smaller sample from each language.
#    (e.g., 1500 samples each to match the smallest category, HTML)
df_balanced = df_filtered.groupby('language').head(1500).reset_index(drop=True)


# Verify the new, balanced counts
print("Language counts in the final balanced dataset:")
print(df_balanced['language'].value_counts())

# Display the final prepared data
df_balanced.head()

Language counts in the final balanced dataset:
language
JavaScript    1500
HTML          1500
Name: count, dtype: int64


Unnamed: 0,snippet,language
0,'use strict';\n\nconst assert = require('asser...,JavaScript
1,const nconf = require('nconf');\n\nconst db = ...,JavaScript
2,const User = require('../src/user');\nconst so...,JavaScript
3,"\ndescribe('Groups', () => {\n\tlet adminUid;\...",JavaScript
4,\t\tconst navData = require('../install/data/n...,JavaScript


In [6]:
import sqlite3
import pandas as pd

db_file = "snippets.db"
rows_to_read = 500000 # Read a much larger initial sample

# --- 1. Load a larger sample from the database ---
print(f"Loading {rows_to_read} rows from '{db_file}'...")
try:
    conn = sqlite3.connect(db_file)
    df_large_sample = pd.read_sql_query(f"SELECT snippet, language FROM snippets LIMIT {rows_to_read}", conn)
    conn.close()
    print("✅ Sample loaded successfully.")
except Exception as e:
    print(f"❌ An error occurred: {e}")


# --- 2. Clean and filter for major languages ---
df_large_sample.dropna(inplace=True)

# Define the languages you want to include in your model
target_languages = [
    'JavaScript', 'HTML', 'CSS', 'Python', 'Java', 
    'C++', 'C', 'Shell', 'Ruby', 'Go'
]

df_filtered = df_large_sample[df_large_sample['language'].isin(target_languages)]


# --- 3. Balance the dataset ---
# Find the size of the smallest language category to balance against
min_count = df_filtered['language'].value_counts().min()
print(f"\nBalancing dataset to {min_count} samples per language.")

df_balanced = df_filtered.groupby('language').sample(n=min_count, random_state=42).reset_index(drop=True)

# --- 4. Verify the final dataset ---
print("\nLanguage counts in the final balanced dataset:")
print(df_balanced['language'].value_counts())

print("\nFinal dataset ready for training:")
display(df_balanced.head())

Loading 500000 rows from 'snippets.db'...
✅ Sample loaded successfully.

Balancing dataset to 63 samples per language.

Language counts in the final balanced dataset:
language
C             63
C++           63
HTML          63
Java          63
JavaScript    63
Python        63
Shell         63
Name: count, dtype: int64

Final dataset ready for training:


Unnamed: 0,snippet,language
0,"std::unordered_map<int64, ScopedAllocatorCon...",C
1,/* Copyright 2018 The TensorFlow Authors. All ...,C
2,DCHECK(valid());\n return batch_pointer...,C
3,"\n TensorEvaluator<ArgType, Device> m_impl;\n...",C
4,\njint netty_unix_limits_JNI_OnLoad(JNIEnv* en...,C


In [None]:
import sqlite3
import pandas as pd

db_file = "snippets.db"
rows_to_read = 2000000 # <-- Increased to 2 million

# --- 1. Load a larger sample from the database ---
print(f"Loading {rows_to_read} rows from '{db_file}'...")
try:
    conn = sqlite3.connect(db_file)
    df_large_sample = pd.read_sql_query(f"SELECT snippet, language FROM snippets LIMIT {rows_to_read}", conn)
    conn.close()
    print("✅ Sample loaded successfully.")
except Exception as e:
    print(f"❌ An error occurred: {e}")


# --- 2. Clean and filter for major languages ---
df_large_sample.dropna(inplace=True)

target_languages = [
    'JavaScript', 'HTML', 'CSS', 'Python', 'Java', 
    'C++', 'C', 'Shell', 'Ruby', 'Go',
    
]

df_filtered = df_large_sample[df_large_sample['language'].isin(target_languages)]


# --- 3. Balance the dataset ---
min_count = df_filtered['language'].value_counts().min()
print(f"\nBalancing dataset to {min_count} samples per language.")

df_balanced = df_filtered.groupby('language').sample(n=min_count, random_state=42).reset_index(drop=True)

# --- 4. Verify the final dataset ---
print("\nLanguage counts in the final balanced dataset:")
print(df_balanced['language'].value_counts())

print("\nFinal dataset ready for training:")
display(df_balanced.head())

Loading 2000000 rows from 'snippets.db'...
✅ Sample loaded successfully.

Balancing dataset to 1337 samples per language.

Language counts in the final balanced dataset:
language
C             1337
C++           1337
Go            1337
HTML          1337
Java          1337
JavaScript    1337
Python        1337
Ruby          1337
Shell         1337
Name: count, dtype: int64

Final dataset ready for training:


Unnamed: 0,snippet,language
0,"kDefault,\n };\n static constexpr const ...",C
1,#endif /* JEMALLOC_H_STRUCTS */\n/************...,C
2,\n private:\n BufferAssigner(bool allocate_bu...,C
3,\tswitch ((buf[2] >> 4) & 0xf) {\n\tcase 0x0:\...,C
4,se::DeviceMemoryAllocator* allocator);\n...,C


In [8]:
import sqlite3
import pandas as pd

db_file = "snippets.db"
rows_to_read = 2000000 # <-- Increased to 2 million

# --- 1. Load a larger sample from the database ---
print(f"Loading {rows_to_read} rows from '{db_file}'...")
try:
    conn = sqlite3.connect(db_file)
    df_large_sample = pd.read_sql_query(f"SELECT snippet, language FROM snippets LIMIT {rows_to_read}", conn)
    conn.close()
    print("✅ Sample loaded successfully.")
except Exception as e:
    print(f"❌ An error occurred: {e}")


# --- 2. Clean and filter for major languages ---
df_large_sample.dropna(inplace=True)

target_languages = [
    'JavaScript', 'HTML', 'CSS', 'Python', 'Java', 
    'C++', 'C', 'Shell', 'Ruby', 'Go', 'CSV', 'DOTFILE', 'JSON', 'Jupyter', 'Markdown', 
    'Powershell', 'Rust', 'TSV', 'Text', 'YAML'
    
]

df_filtered = df_large_sample[df_large_sample['language'].isin(target_languages)]


# --- 3. Balance the dataset ---
min_count = df_filtered['language'].value_counts().min()
print(f"\nBalancing dataset to {min_count} samples per language.")

df_balanced = df_filtered.groupby('language').sample(n=min_count, random_state=42).reset_index(drop=True)

# --- 4. Verify the final dataset ---
print("\nLanguage counts in the final balanced dataset:")
print(df_balanced['language'].value_counts())

print("\nFinal dataset ready for training:")
display(df_balanced.head())

Loading 2000000 rows from 'snippets.db'...
✅ Sample loaded successfully.

Balancing dataset to 149 samples per language.

Language counts in the final balanced dataset:
language
C             149
C++           149
CSV           149
DOTFILE       149
Go            149
HTML          149
JSON          149
Java          149
JavaScript    149
Jupyter       149
Markdown      149
Python        149
Ruby          149
Shell         149
Text          149
YAML          149
Name: count, dtype: int64

Final dataset ready for training:


Unnamed: 0,snippet,language
0,"kDefault,\n };\n static constexpr const ...",C
1,#endif /* JEMALLOC_H_STRUCTS */\n/************...,C
2,\n private:\n BufferAssigner(bool allocate_bu...,C
3,\tswitch ((buf[2] >> 4) & 0xf) {\n\tcase 0x0:\...,C
4,se::DeviceMemoryAllocator* allocator);\n...,C


In [9]:
import sqlite3
import pandas as pd

db_file = "snippets.db"
rows_to_read = 3000000 # <-- Increased to 2 million

# --- 1. Load a larger sample from the database ---
print(f"Loading {rows_to_read} rows from '{db_file}'...")
try:
    conn = sqlite3.connect(db_file)
    df_large_sample = pd.read_sql_query(f"SELECT snippet, language FROM snippets LIMIT {rows_to_read}", conn)
    conn.close()
    print("✅ Sample loaded successfully.")
except Exception as e:
    print(f"❌ An error occurred: {e}")


# --- 2. Clean and filter for major languages ---
df_large_sample.dropna(inplace=True)

target_languages = [
    'JavaScript', 'HTML', 'CSS', 'Python', 'Java', 
    'C++', 'C', 'Shell', 'Ruby', 'Go', 'CSV', 'DOTFILE', 'JSON', 'Jupyter', 'Markdown', 
    'Powershell', 'Rust', 'TSV', 'Text', 'YAML'
    
]

df_filtered = df_large_sample[df_large_sample['language'].isin(target_languages)]


# --- 3. Balance the dataset ---
min_count = df_filtered['language'].value_counts().min()
print(f"\nBalancing dataset to {min_count} samples per language.")

df_balanced = df_filtered.groupby('language').sample(n=min_count, random_state=42).reset_index(drop=True)

# --- 4. Verify the final dataset ---
print("\nLanguage counts in the final balanced dataset:")
print(df_balanced['language'].value_counts())

print("\nFinal dataset ready for training:")
display(df_balanced.head())

Loading 3000000 rows from 'snippets.db'...
✅ Sample loaded successfully.

Balancing dataset to 1541 samples per language.

Language counts in the final balanced dataset:
language
C             1541
C++           1541
CSV           1541
DOTFILE       1541
Go            1541
HTML          1541
JSON          1541
Java          1541
JavaScript    1541
Jupyter       1541
Markdown      1541
Python        1541
Ruby          1541
Shell         1541
Text          1541
YAML          1541
Name: count, dtype: int64

Final dataset ready for training:


Unnamed: 0,snippet,language
0,\t\t\trfcn.EndAddress = savedEndOff;\n\t\t}\n\...,C
1,\tret = R_NEW0 (pyc_object);\n\tif (!ret) {\n\...,C
2,simdjson_really_inline uint32_t json_iterator:...,C
3,virtual ::tensorflow::protobuf::io::ZeroCo...,C
4,/* Copyright 2017 The TensorFlow Authors. All ...,C


In [6]:
from sklearn.model_selection import train_test_split

# X is the feature (the code snippet)
X = df_balanced['snippet']
# y is the label (the programming language)
y = df_balanced['language']

# Split the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

NameError: name 'df_balanced' is not defined

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
# max_features limits the vocabulary to the top 5000 most frequent words/tokens
tfidf_vectorizer = TfidfVectorizer(max_features=20000)

# Learn the vocabulary from the training data and transform it into vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same learned vocabulary
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Text vectorization complete.")
print("Shape of training data:", X_train_tfidf.shape)

NameError: name 'X_train' is not defined

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 1. Initialize the Multinomial Naive Bayes classifier
model = MultinomialNB()

# 2. Train the model on your TF-IDF training data
print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("✅ Model training complete.")

# 3. Make predictions on the test data
print("\nMaking predictions on the test data...")
y_pred = model.predict(X_test_tfidf)

# 4. Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")


Training the model...
✅ Model training complete.

Making predictions on the test data...

Model Accuracy: 73.03%


In [13]:
import joblib

# Save the trained TF-IDF vectorizer to a file
joblib.dump(tfidf_vectorizer, 'vectorizer.pkl')

# Save the trained Multinomial Naive Bayes model to a file
joblib.dump(model, 'language_classifier.pkl')

print("✅ Model and vectorizer saved to 'language_classifier.pkl' and 'vectorizer.pkl'")

✅ Model and vectorizer saved to 'language_classifier.pkl' and 'vectorizer.pkl'


In [1]:
import pandas as pd
df = pd.read_csv("github_snippets_sample.csv")
min_count = df['language'].value_counts().min()
df_balanced = df.groupby('language').sample(n=min_count, random_state=42)

# Verify the new counts
print(df_balanced['language'].value_counts())

language
C             8045
C++           8045
Go            8045
HTML          8045
Java          8045
JavaScript    8045
Python        8045
Ruby          8045
Shell         8045
Name: count, dtype: int64


In [2]:
import pandas as pd
df = pd.read_csv("github_snippets_sample.csv")
min_count = df['language'].value_counts().min()
df_balanced = df.groupby('language').sample(n=min_count, random_state=42)

# Verify the new counts
print(df_balanced['language'].value_counts())

language
C             1541
C++           1541
CSV           1541
Go            1541
HTML          1541
Java          1541
JavaScript    1541
Python        1541
Ruby          1541
Shell         1541
Text          1541
YAML          1541
Name: count, dtype: int64


In [3]:
import pandas as pd
df = pd.read_csv("github_snippets_sample.csv")
min_count = df['language'].value_counts().min()
df_balanced = df.groupby('language').sample(n=min_count, random_state=42)

# Verify the new counts
print(df_balanced['language'].value_counts())

language
C             1541
C++           1541
CSV           1541
Go            1541
HTML          1541
Java          1541
JavaScript    1541
Python        1541
Ruby          1541
Shell         1541
Text          1541
YAML          1541
Name: count, dtype: int64


In [7]:
import pandas as pd
df = pd.read_csv("github_snippets_sample.csv")
min_count = df['language'].value_counts().min()
df_balanced = df.groupby('language').sample(n=min_count, random_state=42)

# Verify the new counts
print(df_balanced['language'].value_counts())

language
C             2667
C++           2667
CSV           2667
Go            2667
HTML          2667
Java          2667
JavaScript    2667
Python        2667
Ruby          2667
Rust          2667
Shell         2667
Text          2667
YAML          2667
Name: count, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split

X = df_balanced['snippet']
y = df_balanced['language']

# Split the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 1. Define the model and the parameters to test
model = LogisticRegression(max_iter=1000)
param_grid = {'C': [0.1, 1, 10]} # Test C values of 0.1, 1, and 10

# 2. Set up the grid search
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)

# 3. Run the search on your training data
print("Starting hyperparameter tuning...")
grid_search.fit(X_train_tfidf, y_train)

# 4. Use the best model found by the search
print("Best parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# 5. Evaluate the new, optimized model
y_pred = best_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOptimized Model Accuracy: {accuracy * 100:.2f}%")

Starting hyperparameter tuning...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters found: {'C': 10}

Optimized Model Accuracy: 80.19%


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Look at single words, pairs of words, and triplets of words
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), 
    max_features=25000
)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# 1. Define the parameters you want to test
param_grid = {
    'n_estimators': [200, 400],
    'learning_rate': [0.05, 0.1],
    'colsample_bytree': [0.8, 1.0],
}

# 2. Initialize the model and the grid search
lgbm = lgb.LGBMClassifier(objective='multiclass', metric='multi_logloss', n_jobs=-1)
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=3, verbose=2)

# 3. Run the search on your training data
print("Starting hyperparameter tuning for LightGBM...")
grid_search.fit(X_train_tfidf, y_train)
print("✅ Tuning complete.")

# 4. Use the best model found by the search
print("\nBest parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# 5. Evaluate the new, optimized model
y_pred = best_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOptimized LightGBM Model Accuracy: {accuracy * 100:.2f}%")

Starting hyperparameter tuning for LightGBM...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.375471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightG



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=200; total time=  53.4s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=200; total time=  45.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=200; total time=  45.3s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.160570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=400; total time= 1.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=400; total time= 1.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.177177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=0.8, learning_rate=0.05, n_estimators=400; total time= 1.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.132350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=200; total time=  43.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.139104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info]



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=200; total time=  42.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info]



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=200; total time=  40.7s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info]



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=400; total time= 1.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=400; total time= 1.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145778 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=0.8, learning_rate=0.1, n_estimators=400; total time= 1.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.136527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=200; total time=  48.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=200; total time=  49.3s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.133225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=200; total time=  46.1s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=400; total time= 1.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tr



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=400; total time= 1.5min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.159413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=400; total time= 1.5min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=200; total time=  46.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info]



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=200; total time=  45.6s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.146568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=200; total time=  46.3s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65976
[LightGBM] [Info] Number of data points in the train set: 18490, number of used features: 2991
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.564463
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start training from score -2.565166
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=400; total time= 1.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66042
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2955
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start tra



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=400; total time= 1.4min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66286
[LightGBM] [Info] Number of data points in the train set: 18491, number of used features: 2996
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.565220
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info] Start training from score -2.564517
[LightGBM] [Info]



[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=400; total time= 1.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.273934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101676
[LightGBM] [Info] Number of data points in the train set: 27736, number of used features: 4117
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.565202
[LightGBM] [Info] Start training from score -2.565202
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.565202
[LightGBM] [Info] Start training from score -2.565202
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.564733
[LightGBM] [Info] Start training from score -2.565202
[LightGBM] [Info] Start tr




Optimized LightGBM Model Accuracy: 75.85%


In [14]:
# ========================================
# IMPROVED MODEL WITH ENHANCED FEATURES
# ========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
from collections import Counter

# Load and prepare the data
df = pd.read_csv("github_snippets_sample.csv")

# Enhanced text preprocessing function
def enhanced_preprocessing(text):
    if pd.isna(text):
        return ""
    
    text = str(text)
    
    # Remove excessive whitespace and normalize
    text = re.sub(r'\s+', ' ', text)
    
    # Keep important programming symbols but normalize
    text = re.sub(r'[{}();,]', lambda m: f' {m.group()} ', text)
    
    # Handle common programming patterns
    text = re.sub(r'(\w+)\s*\(', r'\1 (', text)  # function calls
    text = re.sub(r'(\w+)\s*\[', r'\1 [', text)  # array access
    text = re.sub(r'(\w+)\s*\{', r'\1 {', text)  # object/block start
    
    # Normalize numbers and strings
    text = re.sub(r'\b\d+\b', ' NUM ', text)
    text = re.sub(r'"[^"]*"', ' STRING ', text)
    text = re.sub(r"'[^']*'", ' STRING ', text)
    
    # Handle imports and includes
    text = re.sub(r'(import|from|include|require)\s+\w+', r'\1 MODULE', text)
    
    return text.strip()

# Apply enhanced preprocessing
print("Applying enhanced text preprocessing...")
df['processed_snippet'] = df['snippet'].apply(enhanced_preprocessing)

# Filter and balance the dataset
target_languages = [
    'JavaScript', 'HTML', 'CSS', 'Python', 'Java', 
    'C++', 'C', 'Shell', 'Ruby', 'Go', 'CSV', 'DOTFILE', 'JSON', 
    'Jupyter', 'Markdown', 'Powershell', 'Rust', 'TSV', 'Text', 'YAML'
]

df_filtered = df[df['language'].isin(target_languages)].copy()
df_filtered.dropna(inplace=True)

# Increase sample size per language for better training
min_samples = 2000  # Increased from 1541
df_balanced = df_filtered.groupby('language').apply(
    lambda x: x.sample(min(min_samples, len(x)), random_state=42)
).reset_index(drop=True)

print(f"\nEnhanced dataset prepared:")
print(f"Total samples: {len(df_balanced)}")
print(f"Samples per language:")
print(df_balanced['language'].value_counts().sort_index())


Applying enhanced text preprocessing...

Enhanced dataset prepared:
Total samples: 26000
Samples per language:
language
C             2000
C++           2000
CSV           2000
Go            2000
HTML          2000
Java          2000
JavaScript    2000
Python        2000
Ruby          2000
Rust          2000
Shell         2000
Text          2000
YAML          2000
Name: count, dtype: int64


  df_balanced = df_filtered.groupby('language').apply(


In [15]:
# Enhanced Feature Engineering with Multiple Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Create multiple feature sets for ensemble learning
def create_enhanced_features(X_train, X_test):
    features = {}
    
    # 1. Standard TF-IDF with optimized parameters
    tfidf_std = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=30000,
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
        lowercase=True
    )
    features['tfidf_std'] = {
        'train': tfidf_std.fit_transform(X_train),
        'test': tfidf_std.transform(X_test),
        'vectorizer': tfidf_std
    }
    
    # 2. Character-level TF-IDF for capturing syntax patterns
    tfidf_char = TfidfVectorizer(
        analyzer='char',
        ngram_range=(2, 5),
        max_features=15000,
        min_df=2
    )
    features['tfidf_char'] = {
        'train': tfidf_char.fit_transform(X_train),
        'test': tfidf_char.transform(X_test),
        'vectorizer': tfidf_char
    }
    
    # 3. Word-level TF-IDF with different n-gram ranges
    tfidf_word = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=25000,
        min_df=3,
        max_df=0.9,
        stop_words=None  # Keep programming keywords
    )
    features['tfidf_word'] = {
        'train': tfidf_word.fit_transform(X_train),
        'test': tfidf_word.transform(X_test),
        'vectorizer': tfidf_word
    }
    
    return features

# Prepare train/test split
X = df_balanced['processed_snippet']
y = df_balanced['language']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Create enhanced features
print("\nCreating enhanced feature sets...")
feature_sets = create_enhanced_features(X_train, X_test)
print("✅ Enhanced features created successfully!")


Training samples: 20800
Test samples: 5200

Creating enhanced feature sets...
✅ Enhanced features created successfully!


In [20]:
# Advanced Model Training with Ensemble Methods
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import time

def train_ensemble_models(feature_sets, y_train, y_test):
    results = {}
    
    # 1. Optimized Logistic Regression
    print("Training Logistic Regression with hyperparameter tuning...")
    lr_params = {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }
    
    lr_best_scores = {}
    for name, features in feature_sets.items():
        lr = LogisticRegression(max_iter=1000, random_state=42)
        grid_search = GridSearchCV(lr, lr_params, cv=3, n_jobs=-1, scoring='accuracy')
        grid_search.fit(features['train'], y_train)
        lr_best_scores[name] = grid_search.best_estimator_
        y_pred = grid_search.predict(features['test'])
        accuracy = accuracy_score(y_test, y_pred)
        print(f"  {name}: {accuracy:.4f} (best params: {grid_search.best_params_})")
    
    # 2. Random Forest Classifier
    print("\nTraining Random Forest with hyperparameter tuning...")
    rf_params = {
        'n_estimators': [200, 300],
        'max_depth': [20, 30, None],
        'min_samples_split': [2, 5]
    }
    
    rf_best_scores = {}
    for name, features in feature_sets.items():
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        grid_search = GridSearchCV(rf, rf_params, cv=3, n_jobs=-1, scoring='accuracy')
        grid_search.fit(features['train'], y_train)
        rf_best_scores[name] = grid_search.best_estimator_
        y_pred = grid_search.predict(features['test'])
        accuracy = accuracy_score(y_test, y_pred)
        print(f"  {name}: {accuracy:.4f} (best params: {grid_search.best_params_})")
    
    # 3. SVM Classifier (with probability=True for ensemble voting)
    print("\nTraining SVM with hyperparameter tuning...")
    svm_params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
    
    svm_best_scores = {}
    for name, features in feature_sets.items():
        svm = SVC(random_state=42, probability=True)  # Enable probability estimation
        grid_search = GridSearchCV(svm, svm_params, cv=3, n_jobs=-1, scoring='accuracy')
        grid_search.fit(features['train'], y_train)
        svm_best_scores[name] = grid_search.best_estimator_
        y_pred = grid_search.predict(features['test'])
        accuracy = accuracy_score(y_test, y_pred)
        print(f"  {name}: {accuracy:.4f} (best params: {grid_search.best_params_})")
    
    return {
        'logistic_regression': lr_best_scores,
        'random_forest': rf_best_scores,
        'svm': svm_best_scores
    }

# Train all models
print("Starting advanced model training...")
start_time = time.time()
trained_models = train_ensemble_models(feature_sets, y_train, y_test)
training_time = time.time() - start_time
print(f"\n✅ Training completed in {training_time:.2f} seconds")


Starting advanced model training...
Training Logistic Regression with hyperparameter tuning...




  tfidf_std: 0.8179 (best params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'})




  tfidf_char: 0.8965 (best params: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'})




  tfidf_word: 0.8021 (best params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'})

Training Random Forest with hyperparameter tuning...
  tfidf_std: 0.7769 (best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300})
  tfidf_char: 0.8737 (best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300})
  tfidf_word: 0.7717 (best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300})

Training SVM with hyperparameter tuning...
  tfidf_std: 0.8167 (best params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'})
  tfidf_char: 0.9058 (best params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'})
  tfidf_word: 0.8062 (best params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'})

✅ Training completed in 40554.27 seconds


In [21]:
# Create Ensemble Model with Voting Classifier
from sklearn.ensemble import VotingClassifier
import numpy as np

def create_ensemble_model(trained_models, feature_sets, X_train, X_test, y_train, y_test):
    print("Creating ensemble model with voting classifier...")
    
    # Find the best performing model for each algorithm
    best_models = []
    
    # Get best Logistic Regression
    lr_scores = {}
    for name, model in trained_models['logistic_regression'].items():
        y_pred = model.predict(feature_sets[name]['test'])
        lr_scores[name] = accuracy_score(y_test, y_pred)
    best_lr_name = max(lr_scores, key=lr_scores.get)
    best_lr_model = trained_models['logistic_regression'][best_lr_name]
    print(f"Best LR: {best_lr_name} (accuracy: {lr_scores[best_lr_name]:.4f})")
    
    # Get best Random Forest
    rf_scores = {}
    for name, model in trained_models['random_forest'].items():
        y_pred = model.predict(feature_sets[name]['test'])
        rf_scores[name] = accuracy_score(y_test, y_pred)
    best_rf_name = max(rf_scores, key=rf_scores.get)
    best_rf_model = trained_models['random_forest'][best_rf_name]
    print(f"Best RF: {best_rf_name} (accuracy: {rf_scores[best_rf_name]:.4f})")
    
    # Get best SVM
    svm_scores = {}
    for name, model in trained_models['svm'].items():
        y_pred = model.predict(feature_sets[name]['test'])
        svm_scores[name] = accuracy_score(y_test, y_pred)
    best_svm_name = max(svm_scores, key=svm_scores.get)
    best_svm_model = trained_models['svm'][best_svm_name]
    print(f"Best SVM: {best_svm_name} (accuracy: {svm_scores[best_svm_name]:.4f})")
    
    # Create ensemble with best feature set (tfidf_std typically performs best)
    best_feature_set = 'tfidf_std'
    
    # Create a pipeline that combines the best models
    ensemble = VotingClassifier(
        estimators=[
            ('lr', best_lr_model),
            ('rf', best_rf_model),
            ('svm', best_svm_model)
        ],
        voting='soft'  # Use predicted probabilities for voting
    )
    
    # Train the ensemble
    ensemble.fit(feature_sets[best_feature_set]['train'], y_train)
    
    # Evaluate ensemble
    y_pred_ensemble = ensemble.predict(feature_sets[best_feature_set]['test'])
    ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
    
    print(f"\n🎯 ENSEMBLE MODEL ACCURACY: {ensemble_accuracy:.4f} ({ensemble_accuracy*100:.2f}%)")
    
    return ensemble, feature_sets[best_feature_set]['vectorizer'], ensemble_accuracy

# Create the final ensemble model
final_ensemble, final_vectorizer, final_accuracy = create_ensemble_model(
    trained_models, feature_sets, X_train, X_test, y_train, y_test
)


Creating ensemble model with voting classifier...
Best LR: tfidf_char (accuracy: 0.8965)
Best RF: tfidf_char (accuracy: 0.8737)
Best SVM: tfidf_char (accuracy: 0.9058)





🎯 ENSEMBLE MODEL ACCURACY: 0.8187 (81.87%)


In [24]:
# Cross-Validation and Detailed Evaluation (No Visualization Dependencies)
from sklearn.model_selection import cross_val_score

def comprehensive_evaluation(ensemble, vectorizer, X_train, y_train, X_test, y_test):
    print("="*60)
    print("COMPREHENSIVE MODEL EVALUATION")
    print("="*60)
    
    # 1. Cross-validation scores
    print("\n1. Cross-Validation Analysis:")
    cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
    print(f"   CV Mean Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"   Individual CV scores: {cv_scores}")
    
    # 2. Detailed classification report
    print("\n2. Detailed Classification Report:")
    y_pred = ensemble.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # 3. Confusion Matrix (Text-based)
    print("\n3. Confusion Matrix Summary:")
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    
    # Print confusion matrix for top languages
    unique_labels = sorted(y_test.unique())
    print(f"   Languages: {unique_labels}")
    print(f"   Matrix shape: {cm.shape}")
    print(f"   Total correct predictions: {cm.trace()}")
    print(f"   Total predictions: {cm.sum()}")
    
    # 4. Per-language accuracy
    print("\n4. Per-Language Accuracy:")
    from sklearn.metrics import accuracy_score
    for language in sorted(y_test.unique()):
        mask = y_test == language
        if mask.sum() > 0:
            lang_accuracy = accuracy_score(y_test[mask], y_pred[mask])
            print(f"   {language:15s}: {lang_accuracy:.4f} ({mask.sum()} samples)")
    
    return cv_scores.mean()

# Run comprehensive evaluation (pass the correct feature data)
# We need to use the same features that the ensemble was trained on
best_feature_set = 'tfidf_std'
X_train_features = feature_sets[best_feature_set]['train']
X_test_features = feature_sets[best_feature_set]['test']

cv_mean_score = comprehensive_evaluation(final_ensemble, final_vectorizer, X_train_features, y_train, X_test_features, y_test)

print(f"\n🏆 FINAL MODEL PERFORMANCE SUMMARY:")
print(f"   Test Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(f"   CV Mean Score: {cv_mean_score:.4f} ({cv_mean_score*100:.2f}%)")
print(f"   Improvement over baseline: +{(final_accuracy - 0.8019)*100:.2f}%")


COMPREHENSIVE MODEL EVALUATION

1. Cross-Validation Analysis:




   CV Mean Accuracy: 0.8293 (+/- 0.0118)
   Individual CV scores: [0.82067308 0.83293269 0.83076923 0.83725962 0.82475962]

2. Detailed Classification Report:
              precision    recall  f1-score   support

           C       0.63      0.64      0.63       400
         C++       0.69      0.67      0.68       400
         CSV       0.90      0.97      0.93       400
          Go       0.82      0.74      0.78       400
        HTML       0.80      0.90      0.84       400
        Java       0.87      0.81      0.84       400
  JavaScript       0.70      0.79      0.74       400
      Python       0.80      0.80      0.80       400
        Ruby       0.93      0.88      0.91       400
        Rust       0.94      0.92      0.93       400
       Shell       0.87      0.90      0.88       400
        Text       0.83      0.75      0.79       400
        YAML       0.90      0.87      0.89       400

    accuracy                           0.82      5200
   macro avg       0.82      

In [25]:
# Save the Improved Model and Vectorizer
import joblib
import pickle

# Save the enhanced ensemble model
joblib.dump(final_ensemble, 'enhanced_language_classifier.pkl')

# Save the enhanced vectorizer
joblib.dump(final_vectorizer, 'enhanced_vectorizer.pkl')

# Also save the preprocessing function for use in the Flask app
def save_preprocessing_function():
    import inspect
    source = inspect.getsource(enhanced_preprocessing)
    with open('preprocessing_function.py', 'w') as f:
        f.write(source)

save_preprocessing_function()

print("✅ Enhanced model and vectorizer saved successfully!")
print("   - enhanced_language_classifier.pkl (ensemble model)")
print("   - enhanced_vectorizer.pkl (enhanced vectorizer)")
print("   - preprocessing_function.py (preprocessing function)")

# Test the saved model
print("\n🧪 Testing saved model...")
test_model = joblib.load('enhanced_language_classifier.pkl')
test_vectorizer = joblib.load('enhanced_vectorizer.pkl')

# Test with a sample
sample_text = "def hello_world():\n    print('Hello, World!')\n    return True"
processed_sample = enhanced_preprocessing(sample_text)
sample_vector = test_vectorizer.transform([processed_sample])
prediction = test_model.predict(sample_vector)[0]
confidence = test_model.predict_proba(sample_vector).max()

print(f"Sample prediction: {prediction} (confidence: {confidence:.3f})")
print("✅ Model loading and prediction test successful!")


✅ Enhanced model and vectorizer saved successfully!
   - enhanced_language_classifier.pkl (ensemble model)
   - enhanced_vectorizer.pkl (enhanced vectorizer)
   - preprocessing_function.py (preprocessing function)

🧪 Testing saved model...
Sample prediction: Python (confidence: 0.793)
✅ Model loading and prediction test successful!
