# JyotishAI – Full Model + Accuracy + 5 Plots

**Inputs and outputs:**
- Reads `data/clean/clean_kundali.csv` (already cleaned)
- Saves the trained model → `model/jyotish_model.pkl`
- Saves 5 plots → `assets/plots/`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

# Apply the seaborn 'whitegrid' theme to every figure drawn below.
sns.set(style='whitegrid')

# Create both output folders up front. The notebook later writes the model to
# 'model/' and five figures to 'assets/plots/'; neither joblib.dump nor
# plt.savefig creates missing parent directories, so the save steps would
# otherwise fail on a fresh checkout.
for out_dir in ('model', 'assets/plots'):
    os.makedirs(out_dir, exist_ok=True)
print('Ready!')

In [None]:
# Load the pre-cleaned kundali dataset; `df` is reused by the later cells.
df = pd.read_csv('data/clean/clean_kundali.csv')
row_count = len(df)
print(f'Loaded {row_count} clean rows')

# Trailing expression: displayed as the cell output when run in a notebook.
df.head()

In [None]:
# Build one text document per row from the lagna, sun, moon and question
# columns, separated by single spaces.
# Missing values are replaced with '' and every column is cast to str first:
# with plain `+` concatenation a single NaN makes the whole row's input NaN
# (which TfidfVectorizer rejects as an invalid document), and a non-string
# column raises a TypeError.
feature_cols = ['lagna', 'sun', 'moon', 'question']
parts = df[feature_cols].fillna('').astype(str)
df['input'] = parts['lagna'].str.cat(
    [parts[col] for col in feature_cols[1:]], sep=' '
)
X = df['input']
y = df['prediction']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

In [None]:
# Two-step text classifier: TF-IDF features over unigrams and bigrams
# (vocabulary limited via max_features=1000) feeding a multinomial Naive Bayes
# model.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2)))
clf_step = ('clf', MultinomialNB())
model = Pipeline(steps=[tfidf_step, clf_step])

print('Training...')
model.fit(X_train, y_train)
print('Done!')

In [None]:
# Score the held-out split: overall accuracy plus the per-class report.
# zero_division=0 reports 0 for metrics whose denominator is zero.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print(f'ACCURACY: {acc*100:.2f}%')
print(report)

In [None]:
# Persist the whole pipeline object (TF-IDF step and classifier) with joblib.
model_path = 'model/jyotish_model.pkl'
joblib.dump(model, model_path)
print('Model saved!')

In [None]:
# Plot 1: a single bar showing the test accuracy on a fixed 0-1 y-axis.
accuracy_title = f'Model Accuracy: {acc*100:.1f}%'

plt.figure(figsize=(6, 4))
sns.barplot(x=['Model'], y=[acc], palette='Blues_d')
plt.title(accuracy_title)
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.savefig('assets/plots/accuracy.png')
plt.show()

In [None]:
# Plot 2: number of rows per question, bars ordered by value_counts()
# (most frequent first).
question_order = df['question'].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='question', order=question_order, palette='viridis')
plt.xticks(rotation=45)
plt.title('Question Frequency')
plt.tight_layout()
plt.savefig('assets/plots/question_freq.png')
plt.show()

In [None]:
# Plot 3: question counts within the six most frequent lagna values.
top_lagna = df['lagna'].value_counts().head(6).index
top_rows = df[df['lagna'].isin(top_lagna)]

plt.figure(figsize=(11, 6))
sns.countplot(data=top_rows, x='lagna', hue='question', palette='Set2')
plt.title('Top 6 Lagna vs Question')
# Anchor the legend at axes coordinates (1.05, 1), i.e. past the right edge.
plt.legend(bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig('assets/plots/lagna_vs_question.png')
plt.show()

In [None]:
# Plot 4: number of rows per birth year.
# Side effect: adds a 'year' column to df, parsed from 'birth_date'.
birth_dates = pd.to_datetime(df['birth_date'])
df['year'] = birth_dates.dt.year

plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='year', palette='rocket')
plt.title('Birth Year Trend')
plt.tight_layout()
plt.savefig('assets/plots/birth_year.png')
plt.show()

In [None]:
# Plot 5: the ten most frequent whitespace-separated tokens across all
# prediction texts. Tokens are lower-cased only; there is no stop-word or
# punctuation filtering.
all_text = ' '.join(df['prediction'])
words = all_text.lower().split()
common = Counter(words).most_common(10)
# Split the (word, count) pairs into parallel tuples for plotting.
w, c = zip(*common)

plt.figure(figsize=(10, 5))
sns.barplot(x=list(c), y=list(w), palette='magma')
plt.xlabel('Count')
plt.title('Top 10 Prediction Keywords')
plt.tight_layout()
# The output is a bar chart; the file name 'word_cloud.png' is kept as-is.
plt.savefig('assets/plots/word_cloud.png')
plt.show()

## FYP READY!

- Model: `model/jyotish_model.pkl`
- Accuracy: printed by the evaluation cell and plotted in `assets/plots/accuracy.png`
- 5 Plots: In `assets/plots/`