<a href="https://colab.research.google.com/github/Yuvi0503/AI-Code-Reviewer/blob/main/AI_code_review_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install xgboost



In [None]:
import ast
import random

import joblib
import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Download the tokenizer data used by word_tokenize.
# Newer NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models, so request both to stay compatible across NLTK versions.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# === Expanded Sample Dataset ===
# Each entry is a (source_snippet, label) tuple; the label is the string "Good" or "Bad".
# === GOOD CODE EXAMPLES ===
good_code_samples = [
    ("def add(a, b):\n    return a + b", "Good"),
    ("class Calculator:\n    def multiply(self, x, y):\n        return x * y", "Good"),
    ("def factorial(n):\n    return 1 if n == 0 else n * factorial(n-1)", "Good"),
    ("def is_even(n):\n    return n % 2 == 0", "Good"),
    ("def greet(name):\n    print(f'Hello, {name}!')", "Good"),
    ("def square_list(numbers):\n    return [x**2 for x in numbers]", "Good"),
    ("class Person:\n    def __init__(self, name, age):\n        self.name = name\n        self.age = age", "Good"),
    ("def read_file(filename):\n    with open(filename, 'r') as file:\n        return file.read()", "Good"),
    ("def fibonacci(n):\n    a, b = 0, 1\n    for _ in range(n):\n        a, b = b, a + b\n    return a", "Good"),
    ("def reverse_string(s):\n    return s[::-1]", "Good"),
]

# === BAD CODE EXAMPLES ===
# The first five and the Person sample fail to parse or are style-only; the rest
# parse fine and are "Bad" for behavioural reasons, as noted per line.
bad_code_samples = [
    ("def add(a, b)\n    return a + b", "Bad"),  # Syntax error: missing colon after the def signature
    ("class Calculator:\n def multiply(self, x, y) return x * y", "Bad"),  # Syntax error: missing colon after the def signature
    ("def factorial(n):\n    if n = 0:\n        return 1\n    return n * factorial(n-1)", "Bad"),  # Syntax error (= instead of ==)
    ("def is_even(n):\n    if n % 2:\n        return False\n    else n % 2 == 0:\n        return True", "Bad"),  # Syntax error: `else` cannot take a condition
    ("def greet(name):\n    print('Hello, ' + name)", "Bad"),  # Valid code; labelled Bad only for using concatenation instead of an f-string
    ("def square_list(numbers):\n    result = []\n    for num in numbers:\n        result.append(num**2)", "Bad"),  # No return statement, so the result list is discarded
    ("class Person:\n    def __init__(self, name, age):\n    self.name = name\n    self.age = age", "Bad"),  # Indentation error: method body is not indented under the def
    ("def read_file(filename):\n    file = open(filename, 'r')\n    content = file.read()\n    return content", "Bad"),  # File is opened without `with` and never closed
    ("def fibonacci(n):\n    a = 0\n    b = 1\n    for i in range(n):\n        sum = a + b\n        a = b\n        b = sum", "Bad"),  # No return; also shadows the built-in `sum`
    ("def reverse_string(s):\n    return s.reverse()", "Bad"),  # str has no reverse() method, so this fails at call time
]

# === EXPAND TO 100 SAMPLES ===
# Seed the RNG so the padded dataset and its shuffle order are identical on
# every Restart & Run All; without it each run trains on different data.
random.seed(42)

# Pad each list to 50 entries by re-sampling entries already in the list.
# NOTE(review): padding by duplication means identical snippets end up in both
# the train and the test split later on, so the reported accuracy is optimistic.
while len(good_code_samples) < 50:
    good_code_samples.append(random.choice(good_code_samples))
while len(bad_code_samples) < 50:
    bad_code_samples.append(random.choice(bad_code_samples))

# Combine all data
data = good_code_samples + bad_code_samples
random.shuffle(data)  # Shuffle to mix Good & Bad samples

df = pd.DataFrame(data, columns=["code", "label"])

# === Feature Extraction Functions ===
def tokenize_code(code):
    """Split a code snippet into NLTK word tokens; no stopword filtering is applied."""
    tokens = word_tokenize(code)
    return tokens

def extract_ast_features(code):
    """Collect identifier names from the snippet's AST as one space-separated string.

    The result lists function names first, then class names, then every
    ``ast.Name`` identifier, each group in ``ast.walk`` order. When the snippet
    does not parse, a warning with the offending code is printed and an empty
    string is returned.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        print(f"⚠️ Syntax error in code:\n{code}")
        return ""

    function_names, class_names, identifiers = [], [], []
    # One traversal; the three node types are disjoint, so each node lands in
    # at most one bucket and per-bucket order matches a filtered ast.walk.
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            function_names.append(node.name)
        elif isinstance(node, ast.ClassDef):
            class_names.append(node.name)
        elif isinstance(node, ast.Name):
            identifiers.append(node.id)

    return " ".join(function_names + class_names + identifiers)

# Apply tokenization and AST feature extraction
df["tokens"] = df["code"].apply(tokenize_code)
df["ast_features"] = df["code"].apply(extract_ast_features)

# Combine tokenized words and AST features into one text field per snippet
df["processed_text"] = df["tokens"].apply(lambda x: " ".join(x)) + " " + df["ast_features"]

# === Convert Text into Numerical Features ===
# NOTE(review): TfidfVectorizer's default token pattern keeps only word tokens of
# two or more characters, so punctuation such as ':' and '=' never reaches the
# model — confirm this is intended for a syntax-quality classifier.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["processed_text"])
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")  # Save fitted vectorizer

y = df["label"].map({"Good": 1, "Bad": 0})  # Convert labels to binary

# === Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# === Handle Class Imbalance with SMOTE ===
# Resample the TRAINING split only. Fitting SMOTE on the full X/y would build
# synthetic samples from test rows, and the original result was never used.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


# === Train XGBoost Classifier ===
# use_label_encoder is dropped: XGBoost reports it as an unused parameter.
clf = XGBClassifier(eval_metric='logloss', random_state=42)
clf.fit(X_resampled, y_resampled)

# Save the trained XGBoost model
joblib.dump(clf, "xgboost_code_review.pkl")

# === Model Evaluation ===
# Evaluate on the untouched test split (never resampled).
y_pred = clf.predict(X_test)
print("\n🔹 Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))

⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
def factorial(n):
    if n = 0:
        return 1
    return n * factorial(n-1)
⚠️ Syntax error in code:
def factorial(n):
    if n = 0:
        return 1
    return n * factorial(n-1)
⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
def factorial(n):
    if n = 0:
        return 1
    return n * factorial(n-1)
⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
def add(a, b)
    return a + b
⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
class Person:
    def __init__(self, name, age):
    self.name = name
    self.age = age
⚠️ Syntax error in code:
class Calculator:
 def multiply(self, x, y) return x * y
⚠️ Syntax error in code:
c

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Parameters: { "use_label_encoder" } are not used.



In [None]:
import joblib

class CodeReviewModel:
    """Single-snippet inference wrapper around the persisted classifier and vectorizer."""

    def __init__(self):
        """Load "xgboost_code_review.pkl" and "tfidf_vectorizer.pkl" via joblib."""
        self.model = joblib.load("xgboost_code_review.pkl")
        self.vectorizer = joblib.load("tfidf_vectorizer.pkl")

    def predict(self, code_snippet: str) -> str:
        """Classify one snippet and return the string 'Good' or 'Bad'.

        Raises:
            ValueError: if the vectorizer's output width differs from the
                feature count the model reports in ``n_features_in_``.
        """
        features = self.vectorizer.transform([code_snippet])

        # Guard against a vectorizer/model pair that were not fitted together
        got = features.shape[1]
        if got != self.model.n_features_in_:
            raise ValueError(f"❌ Feature shape mismatch! Model expects {self.model.n_features_in_}, but got {got}.")

        label = self.model.predict(features)[0]
        if label == 1:
            return "Good"
        return "Bad"

# === TEST SCRIPT ===
# Smoke test: load the saved artifacts and classify one hard-coded snippet.
# In a notebook kernel __name__ is "__main__", so this runs every time the cell executes.
if __name__ == "__main__":
    code_review = CodeReviewModel()  # reads both .pkl files from the working directory
    test_code = "def add(a, b): return a + b"
    print(f"🚀 Prediction: {code_review.predict(test_code)}")


🚀 Prediction: Bad


In [None]:
# Source text of train.py. This MUST be a raw string: in a normal string the
# "\n" escapes inside the embedded string literals are expanded to real line
# breaks, and the generated script fails to parse with a SyntaxError.
code = r"""
# Your full training script goes here...
import nltk
import ast
import numpy as np
import pandas as pd
import random
import joblib
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

nltk.download("punkt")

good_code_samples = [("def add(a, b):\n    return a + b", "Good")]
bad_code_samples = [("def add(a, b)\n    return a + b", "Bad")]

while len(good_code_samples) < 50:
    good_code_samples.append(random.choice(good_code_samples))
while len(bad_code_samples) < 50:
    bad_code_samples.append(random.choice(bad_code_samples))

data = good_code_samples + bad_code_samples
random.shuffle(data)

df = pd.DataFrame(data, columns=["code", "label"])

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["code"])
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

y = df["label"].map({"Good": 1, "Bad": 0})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train_smote, y_train_smote)

joblib.dump(clf, "xgboost_code_review.pkl")

print("\n✅ Model & vectorizer saved successfully!")
"""

# Save the script; UTF-8 is pinned because the text contains a non-ASCII emoji
with open("train.py", "w", encoding="utf-8") as file:
    file.write(code)

print("✅ train.py created successfully!")


✅ train.py created successfully!


In [None]:
from google.colab import files

# Send train.py from the working directory to the browser as a download.
# `files` comes from google.colab, so this cell only works inside Colab.
files.download("train.py")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!python train.py

python3: can't open file '/content/train.py': [Errno 2] No such file or directory


In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Save the trained model
# Uses whatever `clf` is currently bound in the kernel; it is not defined in this cell.
joblib.dump(clf, "xgboost_code_review.pkl")

# Save the TF-IDF vectorizer
# Likewise relies on the kernel's current `vectorizer`; the two objects must come
# from the same training run for the saved pair to be usable together.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ Model and vectorizer saved!")


✅ Model and vectorizer saved!


In [None]:
from google.colab import files

# Download the model and vectorizer
# Both files must already exist in the working directory; `files` is the
# google.colab helper, so this cell is Colab-only.
files.download("xgboost_code_review.pkl")
files.download("tfidf_vectorizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Example dataset (replace with your full dataset)
# NOTE(review): this cell rebinds the notebook-level names `data`, `vectorizer`
# and `X_train`, and overwrites "tfidf_vectorizer.pkl" with a vectorizer fitted
# on only these five strings. The classifier saved in "xgboost_code_review.pkl"
# was fitted on features from a different vectorizer, so the saved pair will
# presumably no longer agree on the feature count — confirm before running.
data = [
    "def add(a, b): return a + b",
    "class Calculator: def multiply(self, x, y): return x * y",
    "def factorial(n): return 1 if n == 0 else n * factorial(n-1)",
    "def is_even(n): return n % 2 == 0",
    "x = 5 y = x + 2 print(y)"  # Bad code
]

# Fit TF-IDF vectorizer on the full dataset
# (here "full dataset" is just the five placeholder strings above)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data)  # Fit on all training examples

# Save the trained vectorizer (replaces any file of the same name)
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ TF-IDF Vectorizer saved successfully!")


✅ TF-IDF Vectorizer saved successfully!


In [None]:
import joblib

# Write second copies of the artifacts under the "(1)" file names that the
# CodeReviewModel defined further down uses as its default paths.
# NOTE(review): `vectorizer` is whatever the kernel bound last; if the
# five-snippet demo cell ran, it is not the vectorizer `clf` was trained with.
joblib.dump(clf, "xgboost_code_review (1).pkl")
joblib.dump(vectorizer, "tfidf_vectorizer (1).pkl")

['tfidf_vectorizer (1).pkl']

In [None]:
import joblib
import numpy as np

class CodeReviewModel:
    """Loads a saved classifier and vectorizer and predicts a label for one snippet."""

    # Was spelled `_init_` (single underscores), which Python never calls as a
    # constructor, so `self.model` and `self.vectorizer` were never set.
    def __init__(self, model_path="xgboost_code_review (1).pkl", vectorizer_path="tfidf_vectorizer (1).pkl"):
        """Load both artifacts with joblib.

        Args:
            model_path: path of the pickled classifier.
            vectorizer_path: path of the pickled vectorizer.
        """
        # NOTE: joblib.load unpickles the file; only load artifacts you produced yourself.
        self.model = joblib.load(model_path)
        self.vectorizer = joblib.load(vectorizer_path)

    def predict(self, code_snippet: str):
        """Vectorize ``code_snippet`` and return the model's raw prediction for it."""
        # The vectorizer expects an iterable of documents, hence the one-item list
        X = self.vectorizer.transform([code_snippet])
        prediction = self.model.predict(X)
        return prediction[0]  # Single prediction for the single input

In [None]:

class ReviewEngine:
    """Turns the classifier's prediction into human-readable review comments."""

    # Was spelled `_init_`, so it never ran and `self.model` did not exist when
    # review_code() was called.
    def __init__(self):
        """Create the underlying CodeReviewModel with its default artifact paths."""
        self.model = CodeReviewModel()

    def review_code(self, code_text):
        """Return a one-element list with a comment for ``code_text``.

        A prediction equal to 1 yields the "looks good" message; any other
        value yields the "potential issue" message.
        """
        prediction = self.model.predict(code_text)

        # This is where you generate a human-readable comment
        # Based on how you trained the model, adjust this logic
        if prediction == 1:
            return ["✅ Code looks good!"]
        else:
            return ["⚠ Potential issue detected in the code."]

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
#from app.review_engine import ReviewEngine

# Application object plus the single ReviewEngine shared by every request
app = FastAPI()
engine = ReviewEngine()


# Request body schema: one string field holding the code to review
class CodeRequest(BaseModel):
    code: str


# POST /review/ — run the engine on the submitted code and wrap its comments
@app.post("/review/")
def review(req: CodeRequest):
    return {"comments": engine.review_code(req.code)}

In [None]:
!pip install fastapi uvicorn nest-asyncio pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Patch the running event loop so nested async loops work in Colab
nest_asyncio.apply()

# Fresh FastAPI app. This rebinds `app` and `CodeRequest`, replacing the
# objects created in the earlier FastAPI cell.
app = FastAPI()


# Request body schema: one string field holding the submitted code
class CodeRequest(BaseModel):
    code: str


# Stub endpoint: acknowledges the request by echoing the first 30 characters
@app.post("/review/")
def review_code(req: CodeRequest):
    preview = req.code[:30]
    return {"comments": [f"📝 Review received for: {preview}..."]}

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the ngrok authtoken in the notebook. The token that
# was previously committed here must be treated as leaked and revoked in the
# ngrok dashboard. Read it from the environment, or prompt without echoing.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# ngrok.connect() returns an NgrokTunnel object, not a string. Formatting the
# object itself prints its repr, so read the URL from its .public_url attribute.
public_url = ngrok.connect(8000)
print(f"🌐 Public FastAPI URL: {public_url.public_url}/docs")

🌐 Public FastAPI URL: NgrokTunnel: "https://0ea3-35-201-251-243.ngrok-free.app" -> "http://localhost:8000"/docs


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio
import uvicorn
from pyngrok import ngrok
import threading

# Apply patch for Colab (once is enough; the original called it a second time below)
nest_asyncio.apply()

# FastAPI app
app = FastAPI()

from fastapi.middleware.cors import CORSMiddleware

# Wildcard origins must not be combined with credentials: the browser page only
# sends a JSON body, so credentialed requests are switched off instead of being
# allowed from every origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For development, allow all origins
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Pydantic model for input
class CodeInput(BaseModel):
    code: str

# API route
@app.post("/predict")
def predict(input: CodeInput):
    # Dummy logic — replace with ML model later
    return {"review": ["Use descriptive variable names", "Avoid global variables"]}

# Start ngrok tunnel
# connect() returns an NgrokTunnel object; the URL string lives in .public_url
public_url = ngrok.connect(8000)
print(f"🌐 Public FastAPI URL: {public_url.public_url}/docs")

# Start FastAPI server
def run():
    """Serve `app` on all interfaces, port 8000 (blocks the calling thread)."""
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Run the blocking server in a background thread so the notebook stays usable
thread = threading.Thread(target=run)
thread.start()


🌐 Public FastAPI URL: NgrokTunnel: "https://bc3a-35-201-251-243.ngrok-free.app" -> "http://localhost:8000"/docs


INFO:     Started server process [471]
INFO:     Waiting for application startup.


In [None]:
import json

# display/HTML are imported from IPython.display; the IPython.core.display path is deprecated.
from IPython.display import HTML, display

# Build the endpoint from the live tunnel created by the ngrok cell instead of
# pasting a URL by hand (ngrok issues a new hostname for every tunnel).
# Requires `public_url` (the NgrokTunnel returned by ngrok.connect) to exist.
api_url = f"{public_url.public_url}/predict"

# Raw string so JavaScript escapes such as "\n" reach the browser unchanged; in
# a normal Python string they became real line breaks inside a JS string
# literal, which is a JS syntax error and left sendCode() undefined.
# The textarea placeholder uses &#10; for its line break for the same reason.
page_template = r'''
<!DOCTYPE html>
<html>
<head>
  <title>AI Code Reviewer</title>
  <style>
    body { font-family: Arial, sans-serif; padding: 20px; }
    textarea { width: 100%; height: 200px; font-family: monospace; }
    button { padding: 10px 20px; margin-top: 10px; font-size: 16px; }
    pre { background-color: #f4f4f4; padding: 15px; border-radius: 5px; }
  </style>
</head>
<body>
  <h2>🔍 Enter Your Code:</h2>
  <textarea id="code" placeholder="def hello():&#10;    print('Hello world')"></textarea>
  <br>
  <button onclick="sendCode()">🚀 Analyze</button>

  <h3>🧠 Review Suggestions:</h3>
  <pre id="output">Output will appear here...</pre>

  <script>
    const apiUrl = __API_URL__; // 🔗 Filled in from the current ngrok tunnel

    async function sendCode() {
      const code = document.getElementById("code").value;
      document.getElementById("output").textContent = "⏳ Sending request...";

      try {
        const response = await fetch(apiUrl, {
          method: "POST",
          headers: {
            "Content-Type": "application/json"
          },
          body: JSON.stringify({ code: code })
        });

        if (!response.ok) {
          const err = await response.text();
          console.error("❌ Server Error:", err);
          document.getElementById("output").textContent = "❌ Server Error " + response.status + ":\n" + err;
        } else {
          const data = await response.json();
          console.log("✅ Response Received:", data);
          document.getElementById("output").textContent = JSON.stringify(data.review, null, 2);
        }
      } catch (error) {
        console.error("❌ JS Fetch Error:", error);
        document.getElementById("output").textContent = "❌ JS Error: " + error;
      }
    }
  </script>
</body>
</html>
'''

# json.dumps yields a correctly quoted and escaped JavaScript string literal
display(HTML(page_template.replace("__API_URL__", json.dumps(api_url))))

In [None]:
def calci(a,b):
  """Return a tuple holding the sum a+b followed by the difference a-b."""
  return a+b, a-b

# Read two integers interactively; int() raises ValueError on non-numeric input
a=int(input("Enter a: "))
b=int(input("Enter b: "))
# p holds the (sum, difference) tuple returned by calci
p=calci(a,b)

In [None]:
p
# Run FastAPI app
#uvicorn.run(app, host="0.0.0.0", port=8000)

NameError: name 'p' is not defined

In [None]:
pip install fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.1-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.12 starlette-0.46.1 uvicorn-0.34.0
