In [32]:
import pandas as pd
import numpy as np
import re, string
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report, roc_auc_score
)

import gensim.downloader as api
import tensorflow as tf
from tensorflow.keras import layers, Model, regularizers, callbacks

In [33]:
df = pd.read_csv("/kaggle/input/donors-choose/Preprocessed_DonorsChoose_dataset.csv", low_memory=False)
print(df.shape)
df.head(2)

(109248, 14)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,teacher_number_of_previously_posted_projects,project_is_approved,price,quantity,cleaned_titles,cleaned_essays,cleaned_summary,isdigit_summary
0,p253737,mrs,in,grades_prek_2,literacy_language,esl_literacy,0,0,154.6,23,educational support english learners home,students english learners working english seco...,students_need_opportunities_practice_beginning...,0
1,p258326,mr,fl,grades_6_8,history_civics_health_sports,civics_government_teamsports,7,1,299.0,1,wanted projector hungry learners,students arrive school eager learn polite gene...,students_need_projector_help_viewing_education...,0


In [34]:
print(df.columns.tolist())

['id', 'teacher_prefix', 'school_state', 'project_grade_category', 'project_subject_categories', 'project_subject_subcategories', 'teacher_number_of_previously_posted_projects', 'project_is_approved', 'price', 'quantity', 'cleaned_titles', 'cleaned_essays', 'cleaned_summary', 'isdigit_summary']


In [35]:
# --- Target selection ---
if 'is_funded' in df.columns:
    y = df['is_funded'].astype(int)

elif 'project_is_approved' in df.columns:
    y = df['project_is_approved'].astype(int)

elif 'total_donations' in df.columns and 'goal' in df.columns:
    y = (df['total_donations'] >= df['goal']).astype(int)

else:
    raise ValueError("No valid target column found!")


# --- Text features selection ---
TEXT_COLS = [c for c in ['cleaned_titles', 'cleaned_summary', 'cleaned_essays'] if c in df.columns]

df['text'] = df[TEXT_COLS].fillna('').agg(' '.join, axis=1)

X_text = df['text']

In [36]:
def clean_text(s):
    s = s.lower()
    s = re.sub(r'<[^>]+>', ' ', s)  # remove HTML
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = re.sub(r'\d+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

X_text = X_text.apply(clean_text)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [38]:
vectorizer = TfidfVectorizer(max_features=100000, ngram_range=(1,3))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf   = vectorizer.transform(X_val)
X_test_tfidf  = vectorizer.transform(X_test)

In [39]:
w2v = api.load('glove-wiki-gigaword-100')  # 100-dim
EMB_DIM = 100

def doc2vec_avg(doc):
    toks = doc.split()
    vecs = [w2v[w] for w in toks if w in w2v]
    return np.mean(vecs, axis=0) if vecs else np.zeros(EMB_DIM)

X_train_w2v = np.vstack([doc2vec_avg(d) for d in X_train])
X_val_w2v   = np.vstack([doc2vec_avg(d) for d in X_val])
X_test_w2v  = np.vstack([doc2vec_avg(d) for d in X_test])

In [40]:
lr = LogisticRegression(max_iter=200, n_jobs=-1)
lr.fit(X_train_tfidf, y_train)
val_pred_lr = lr.predict(X_val_tfidf)
print(classification_report(y_val, val_pred_lr))

              precision    recall  f1-score   support

           0       0.62      0.03      0.05      2647
           1       0.85      1.00      0.92     14833

    accuracy                           0.85     17480
   macro avg       0.74      0.51      0.49     17480
weighted avg       0.82      0.85      0.79     17480



In [41]:
svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
val_pred_svm = svm.predict(X_val_tfidf)
print(classification_report(y_val, val_pred_svm))

              precision    recall  f1-score   support

           0       0.38      0.13      0.20      2647
           1       0.86      0.96      0.91     14833

    accuracy                           0.84     17480
   macro avg       0.62      0.55      0.55     17480
weighted avg       0.79      0.84      0.80     17480



In [42]:
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_tfidf_svd = svd.fit_transform(X_train_tfidf)
X_val_tfidf_svd   = svd.transform(X_val_tfidf)
X_test_tfidf_svd  = svd.transform(X_test_tfidf)

In [43]:
input_tfidf = layers.Input(shape=(300,))
x1 = layers.Dense(128, activation='relu')(input_tfidf)

input_w2v = layers.Input(shape=(EMB_DIM,))
x2 = layers.Dense(64, activation='relu')(input_w2v)

x = layers.Concatenate()([x1, x2])
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(1, activation='sigmoid')(x)

model = Model(inputs=[input_tfidf, input_w2v], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    [X_train_tfidf_svd, X_train_w2v], y_train,
    validation_data=([X_val_tfidf_svd, X_val_w2v], y_val),
    epochs=5, batch_size=256
)

Epoch 1/5
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8367 - loss: 0.4489 - val_accuracy: 0.8489 - val_loss: 0.3871
Epoch 2/5
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8497 - loss: 0.3837 - val_accuracy: 0.8498 - val_loss: 0.3841
Epoch 3/5
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8526 - loss: 0.3761 - val_accuracy: 0.8510 - val_loss: 0.3814
Epoch 4/5
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8530 - loss: 0.3739 - val_accuracy: 0.8516 - val_loss: 0.3793
Epoch 5/5
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8566 - loss: 0.3654 - val_accuracy: 0.8502 - val_loss: 0.3825


In [44]:
val_pred_nn = (model.predict([X_val_tfidf_svd, X_val_w2v]) >= 0.5).astype(int)
print(classification_report(y_val, val_pred_nn))

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.58      0.04      0.07      2647
           1       0.85      1.00      0.92     14833

    accuracy                           0.85     17480
   macro avg       0.72      0.52      0.50     17480
weighted avg       0.81      0.85      0.79     17480



In [45]:
test_pred_nn = (model.predict([X_test_tfidf_svd, X_test_w2v]) >= 0.5).astype(int)
print(classification_report(y_test, test_pred_nn))

[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.57      0.04      0.07      3308
           1       0.85      1.00      0.92     18542

    accuracy                           0.85     21850
   macro avg       0.71      0.52      0.49     21850
weighted avg       0.81      0.85      0.79     21850

