In [1]:
import os
import re
import string
import pandas as pd
import PyPDF2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
import textstat
import shap
import numpy as np

In [2]:

def explain_prediction(index, shap_vals, X_data, y_true, y_pred, class_names, feature_names):
    print(f"\n📄 Sample {index + 1}:")
    predicted = class_names[y_pred[index]]
#     actual = class_names[y_true.iloc[index]]
    print(f"Predicted: {predicted}")
    
    # Get SHAP values for class 1 (Publishable)
    sample_shap = shap_vals[1][index]
    sample_features = X_data.iloc[index]
    
    # Get top contributing features by absolute SHAP value
    top_idx = np.argsort(np.abs(sample_shap))[::-1]
    
    print("Top factors contributing to this decision:")
    for i in top_idx[:5]:  # Top 5 features
        direction = "+" if sample_shap[i] > 0 else "-"
        print(f"  - {feature_names[i]}: {direction}{abs(sample_shap[i]):.3f}")

    print("\n📝 Explanation:")
    if y_pred[index] == 1:
        print("This paper is considered *Publishable* because it performs well on features like:")
        for i in top_idx[:3]:
            if sample_shap[i] > 0:
                print(f"  • {feature_names[i]} (positive impact)")
        print("despite some weaker areas like:")
        for i in top_idx[:3]:
            if sample_shap[i] < 0:
                print(f"  • {feature_names[i]} (negative impact)")
    else:
        print("This paper is considered *Non-Publishable* mainly due to:")
        for i in top_idx[:3]:
            if sample_shap[i] < 0:
                print(f"  • {feature_names[i]} (negative impact)")
        print("even though it had some positive points like:")
        for i in top_idx[:3]:
            if sample_shap[i] > 0:
                print(f"  • {feature_names[i]} (positive impact)")

In [3]:
# ----------- TEXT EXTRACTION ----------
def extract_text(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                text += page.extract_text() or ""
        return text
    except:
        return None

In [4]:
def preprocess_text(text):

    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

In [5]:


def get_sections(text):
    text_lower = text.lower()
    
    # Convert text to lowercase for comparison
    lines = text.splitlines()
    
    intro_start = -1
    intro_end = -1
    concl_start = -1
    concl_end = -1

    #"Introduction" starts
    for i, line in enumerate(lines):
        if 'introduction' in line.lower():
            intro_start = i
            break

    #"Methodology" or "Methods" starts (end of intro)
    for i in range(intro_start + 1, len(lines)):
        if 'methodology' in lines[i].lower() or 'methods' in lines[i].lower():
            intro_end = i
            break

    #"Conclusion", "Results", or "Discussion" starts
    for i, line in enumerate(lines):
        if 'conclusion' in line.lower() or 'results' in line.lower() or 'discussion' in line.lower():
            concl_start = i
            break

    #"References" or "Acknowledgments" starts (end of conclusion)
    for i in range(concl_start + 1, len(lines)):
        if 'references' in lines[i].lower() or 'acknowledgments' in lines[i].lower():
            concl_end = i
            break

    # Extract sections using line indices
    
    if intro_start != -1 and intro_end != -1:
        intro_section = "\n".join(lines[intro_start + 1:intro_end])
    else:
        intro_section = ""

    if concl_start != -1 and concl_end != -1:
        concl_section = "\n".join(lines[concl_start + 1:concl_end])
    else:
        concl_section = ""

    return intro_section, concl_section
    



In [6]:


def get_citation_features(text):

    # Count numbered citations like [1], [2,3], etc. 
    
    numbered_citations = 0
    for i in range(len(text)):
        if text[i] == '[':
            j = i + 1
            while j < len(text) and text[j] != ']':
                j += 1
            if j < len(text) and text[j] == ']':
                inside = text[i+1:j]
                if any(char.isdigit() for char in inside):
                    numbered_citations += 1

                    
                    
                    
    #   Count author-based citations like (Smith et al., 2020)
    
    
    author_citations = 0
    i = 0
    while i < len(text):
        if text[i] == '(':
            j = i + 1
            while j < len(text) and text[j] != ')':
                j += 1
            if j < len(text):
                content = text[i+1:j]
                if 'et al.' in content.lower() and '20' in content:
                    author_citations += 1
            i = j
        i += 1

    total_citations = numbered_citations + author_citations
    
    
    
    

    #   Find and count references section 
    
    text_lower = text.lower()
    start_index = -1
    if "references" in text_lower:
        start_index = text_lower.index("references")
    elif "bibliography" in text_lower:
        start_index = text_lower.index("bibliography")

    references_text = ""
    if start_index != -1:
        references_text = text[start_index:]
        
        
        
    
    # Count number of reference entries (non-empty lines)
    
    
    reference_lines = references_text.split('\n')
    reference_count = 0
    for line in reference_lines:
        if line.strip():  # only count non-empty lines
            reference_count += 1
            
            

    #  Calculate citation density per 1000 words ----
    
    
    words = text.split()
    total_words = len(words)
    if total_words > 0:
        density = (total_citations / total_words) * 1000
        density = round(density, 2)
    else:
        density = 0.0

    return total_citations, reference_count, density


In [7]:
#        IMPROVEMENT DETECTION FEATURE
comparison_phrases = [
    'better than', 'outperforms', 'compared to', 'improves upon',
    'achieves higher', 'higher accuracy', 'lower error', 'state-of-the-art',
    'compared with', 'previous methods', 'surpasses', 'yields better results'
]

In [8]:
def check_improvement_statements(text):
    text = text.lower()
    return int(any(phrase in text for phrase in comparison_phrases))

In [9]:
data = []
model = SentenceTransformer('all-MiniLM-L6-v2')
for label, folder in [('Publishable', 'Publishable'), ('Non-Publishable', 'Non-Publishable')]:
    for file in os.listdir(folder):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(folder, file)
            text = extract_text(pdf_path)
            if not text:
                continue
            
            
            
            # Section extraction
            
            intro, concl = get_sections(text)
            if intro and concl:
                intro_embedding = model.encode(intro)
                concl_embedding = model.encode(concl)
                similarity = np.dot(intro_embedding, concl_embedding)
            else:
                similarity = 0
                
                
                
                
            # Readability
            
            cleaned_text = preprocess_text(text)
            flesch_kincaid = textstat.flesch_kincaid_grade(text)
            dale_chall = textstat.dale_chall_readability_score(text)
            ari = textstat.automated_readability_index(text)
            word_count = len(cleaned_text.split())
            
            
            
            # Citation features
            
            in_text_cit, ref_count, cit_density = get_citation_features(text)
            
            
            
            # Improvement detection
            
            improvement_flag = check_improvement_statements(concl)
            
            
            
            data.append({
                'similarity': similarity,
                'flesch_kincaid': flesch_kincaid,
                'dale_chall': dale_chall,
                'ari': ari,
                'word_count': word_count,
                'in_text_cit': in_text_cit,
                'ref_count': ref_count,
                'cit_density': cit_density,
                'improvement_flag': improvement_flag,
                'label': 1 if label == 'Publishable' else 0
            })

In [10]:
df = pd.DataFrame(data)

In [11]:
# Feature set
features = ['similarity', 'flesch_kincaid', 'dale_chall', 'ari',
            'word_count', 'in_text_cit', 'ref_count', 'cit_density',
            'improvement_flag']
X = df[features]
y = df['label']



# Train/test split & model


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)




# Evaluation



y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")

Accuracy: 1.00
F1 Score: 1.00


In [12]:
# print("\nFeature Importances:")
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': clf.feature_importances_
}).sort_values(by="Importance", ascending=False)
# print(importance_df)

In [13]:
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)

# Choose the class index: 1 = Publishable, 0 = Non-Publishable
class_names = ['Non-Publishable', 'Publishable']
feature_names = X.columns.tolist()
y_pred = clf.predict(X)
for i in range(3):
    explain_prediction(i, shap_values, X, y, y_pred, class_names, feature_names)


📄 Sample 1:
Predicted: Publishable
Top factors contributing to this decision:
  - flesch_kincaid: -0.000
  - similarity: -0.000

📝 Explanation:
This paper is considered *Publishable* because it performs well on features like:
despite some weaker areas like:

📄 Sample 2:
Predicted: Publishable
Top factors contributing to this decision:
  - flesch_kincaid: +0.127
  - similarity: -0.127

📝 Explanation:
This paper is considered *Publishable* because it performs well on features like:
  • flesch_kincaid (positive impact)
despite some weaker areas like:
  • similarity (negative impact)

📄 Sample 3:
Predicted: Publishable
Top factors contributing to this decision:
  - flesch_kincaid: +0.019
  - similarity: -0.019

📝 Explanation:
This paper is considered *Publishable* because it performs well on features like:
  • flesch_kincaid (positive impact)
despite some weaker areas like:
  • similarity (negative impact)


In [14]:
def explain_unlabeled(row, clf, feature_means, prediction):
    reasons = []
    
    # If the prediction is "Publishable" (1), explain positive contributing factors
    if prediction == 1:
        if row['similarity'] > feature_means['similarity']:
            reasons.append("high similarity between Introduction and Conclusion")
        if row['ref_count'] > feature_means['ref_count']:
            reasons.append("strong reference section")
        if row['cit_density'] > feature_means['cit_density']:
            reasons.append("good citation density")
        if row['improvement_flag'] == 1:
            reasons.append("mentions improvement over previous work")
        if row['flesch_kincaid'] < feature_means['flesch_kincaid']:
            reasons.append("uses easy-to-understand language")
        if row['dale_chall'] < feature_means['dale_chall']:
            reasons.append("clear readability (low Dale-Chall score)")

    # If the prediction is "Non-Publishable" (0), explain negative contributing factors
    else:
        if row['similarity'] <= feature_means['similarity']:
            reasons.append("low similarity between Introduction and Conclusion")
        if row['ref_count'] <= feature_means['ref_count']:
            reasons.append("weak reference section")
        if row['cit_density'] <= feature_means['cit_density']:
            reasons.append("low citation density")
        if row['improvement_flag'] == 0:
            reasons.append("does not mention improvement over previous work")
        if row['flesch_kincaid'] > feature_means['flesch_kincaid']:
            reasons.append("uses complex language (high Flesch-Kincaid grade)")
        if row['dale_chall'] > feature_means['dale_chall']:
            reasons.append("difficult readability (high Dale-Chall score)")

    # Return a joined string of reasons
    return "; ".join(reasons) if reasons else "general poor quality across features" if prediction == 0 else "general good quality across features"

In [15]:
def main():
    data = []
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load labeled data (Publishable, Non-Publishable)
    for label, folder in [('Publishable', 'Publishable'), ('Non-Publishable', 'Non-Publishable')]:
        for file in os.listdir(folder):
            if file.endswith('.pdf'):
                pdf_path = os.path.join(folder, file)
                text = extract_text(pdf_path)
                if not text:
                    continue
                
                # Section extraction
                intro, concl = get_sections(text)
                similarity = model.encode(intro) @ model.encode(concl).T if intro and concl else 0

                # Readability
                cleaned_text = text  # Assuming cleaned text is just the extracted text here
                flesch_kincaid = textstat.flesch_kincaid_grade(text)
                dale_chall = textstat.dale_chall_readability_score(text)
                ari = textstat.automated_readability_index(text)
                word_count = len(cleaned_text.split())

                # Citation features
                in_text_cit, ref_count, cit_density = get_citation_features(text)

                # Improvement detection
                improvement_flag = check_improvement_statements(concl)

                data.append({
                    'similarity': similarity,
                    'flesch_kincaid': flesch_kincaid,
                    'dale_chall': dale_chall,
                    'ari': ari,
                    'word_count': word_count,
                    'in_text_cit': in_text_cit,
                    'ref_count': ref_count,
                    'cit_density': cit_density,
                    'improvement_flag': improvement_flag,
                    'label': 1 if label == 'Publishable' else 0,
                    'pdf_name': file
                })

    df = pd.DataFrame(data)

    # Feature set
    features = ['similarity', 'flesch_kincaid', 'dale_chall', 'ari',
                'word_count', 'in_text_cit', 'ref_count', 'cit_density',
                'improvement_flag']
    X = df[features]
    y = df['label']

    # Train/test split & model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Calculate feature means for explanation (after training)
    feature_means = X_train.mean()

    y_pred = clf.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")

    # ----------- UNLABELED PREDICTIONS ----------
    unlabeled_data = []
    unlabeled_folder = 'Unlabeled'
    for file in os.listdir(unlabeled_folder):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(unlabeled_folder, file)
            text = extract_text(pdf_path)
            if not text:
                continue
            
            # Section extraction
            intro, concl = get_sections(text)
            similarity = model.encode(intro) @ model.encode(concl).T if intro and concl else 0

            # Readability
            cleaned_text = text  # Assuming cleaned text is just the extracted text here
            flesch_kincaid = textstat.flesch_kincaid_grade(text)
            dale_chall = textstat.dale_chall_readability_score(text)
            ari = textstat.automated_readability_index(text)
            word_count = len(cleaned_text.split())

            # Citation features
            in_text_cit, ref_count, cit_density = get_citation_features(text)

            # Improvement detection
            improvement_flag = check_improvement_statements(concl)

            unlabeled_data.append({
                'pdf_name': file,
                'similarity': similarity,
                'flesch_kincaid': flesch_kincaid,
                'dale_chall': dale_chall,
                'ari': ari,
                'word_count': word_count,
                'in_text_cit': in_text_cit,
                'ref_count': ref_count,
                'cit_density': cit_density,
                'improvement_flag': improvement_flag
            })

    unlabeled_df = pd.DataFrame(unlabeled_data)

    # Predict unlabeled data
    unlabeled_predictions = clf.predict(unlabeled_df[features])

    # ----------- EXPLAIN PREDICTIONS ----------
    unlabeled_df['Prediction'] = unlabeled_predictions
    unlabeled_df['Reason'] = unlabeled_df.apply(
        lambda row: explain_unlabeled(row, clf, feature_means, row['Prediction']), axis=1
    )

    # Filter out unnecessary columns and retain only Name, Prediction, and Reason
    final_df = unlabeled_df[['pdf_name', 'Prediction', 'Reason']]

    # Save to CSV
    final_df.to_csv('unlabeled_predictions.csv', index=False)
    print("\nUnlabeled predictions saved to 'unlabeled_predictions.csv'")

if __name__ == "__main__":
    main()

Accuracy: 1.00
F1 Score: 1.00

Unlabeled predictions saved to 'unlabeled_predictions.csv'


In [16]:
# !pip install numpy==1.24.4 numba==0.57.1
