In [30]:
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack, csr_matrix

df = pd.read_excel('hpcleanvlarge1.xlsx')
print("Columns in the dataset:", df.columns.tolist())
display(df.head())

Columns in the dataset: ['Chapters', 'Favs', 'Follows', 'Published', 'Reviews', 'Updated', 'Words', 'author', 'characters', 'genre', 'language', 'rating', 'story_link', 'synopsis', 'title', 'published_mmyy', 'pairing']


Unnamed: 0,Chapters,Favs,Follows,Published,Reviews,Updated,Words,author,characters,genre,language,rating,story_link,synopsis,title,published_mmyy,pairing
0,1,2.0,,12/31/2019,1.0,,6840,reviews,"Sirius B., Remus L., James P., Regulus B.",Angst/Hurt/Comfort,English,T,https://www.fanfiction.net/s/13466909/1/If-You...,Regulus and James aren't happy. They know they...,If You Change Your Mind,2019-12-01,
1,1,1.0,,12/31/2019,,,10962,JoyI9199,"Harry P., Draco M., Narcissa M., Charlie W.",Angst/Drama,English,M,https://www.fanfiction.net/s/13466894/1/Bloody...,When a plot from the Founder's age is revealed...,Bloody Ballgowns,2019-12-01,
2,1,3.0,2.0,12/31/2019,,,8592,MoonytheMarauder1,"[James P., Regulus B.]",Angst,English,M,https://www.fanfiction.net/s/13466885/1/Nothin...,"Regulus Black is supposed to be dead, but he's...",Nothing Left To Do,2019-12-01,"James P., Regulus B."
3,2,,,12/31/2019,,,7260,LaviniaKatt,Cedric D.,Romance/Fantasy,English,M,https://www.fanfiction.net/s/13466880/1/Patien...,This is a spin off of Harry Potter taking plac...,Patience is a Virtue,2019-12-01,
4,1,4.0,3.0,12/31/2019,,,1529,Rowena-Moon-Moon,,,English,T,https://www.fanfiction.net/s/13466807/1/An-Und...,Harry makes a new discovery and perhaps a few ...,An Understanding,2019-12-01,


In [31]:
df['combined_text'] = (df['title'].fillna('').astype(str) + " " + df['synopsis'].fillna('').astype(str)).str.lower()

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
text_features = vectorizer.fit_transform(df['combined_text'])


numeric_cols = ['Chapters', 'Favs', 'Words']
df[numeric_cols] = df[numeric_cols].fillna(0)
scaler = StandardScaler()
numeric_features = scaler.fit_transform(df[numeric_cols])
numeric_features_sparse = csr_matrix(numeric_features)

combined_features = hstack([text_features, numeric_features_sparse])

nn_model = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute')
nn_model.fit(combined_features)

query_stories = [
    {"title": "Manacled", "author": "SenLinYu"},
    {"title": "Isolation", "author": "Bex-chan"},
    {"title": "Harry Potter and the Rune Stone Path", "author": "Temporal Knight"}
]

def get_index_from_query(query):
    title = query["title"].strip().lower()
    author = query["author"].strip().lower()
    filtered = df[
    (df['title'].astype(str).str.strip().str.lower() == title) &
    (df['author'].astype(str).str.strip().str.lower() == author)
    ]
    if filtered.empty:
        print(f"Query story '{query['title']} by {query['author']}' not found in the dataset.")
        return None
    return filtered.index[0]

for query in query_stories:
    idx = get_index_from_query(query)
    if idx is None:
        continue
    
    distances, indices = nn_model.kneighbors(combined_features[idx])
    neighbor_indices = indices[0][1:]
    neighbor_distances = distances[0][1:]
    
    similarity_scores = [1 - d for d in neighbor_distances]
    
    similar_stories = df.iloc[neighbor_indices][['title', 'author']].copy()
    similar_stories['similarity'] = similarity_scores
    
    print(f"\nTop 10 stories similar to '{query['title']} by {query['author']}':")
    display(similar_stories.reset_index(drop=True))



Top 10 stories similar to 'Manacled by SenLinYu':


Unnamed: 0,title,author,similarity
0,Crimson with a Silver Lining,Lady Cailan,0.995592
1,Maybe I'm Amazed,Alethea27,0.995011
2,Freedom And Not Peace,Lightning on the Wave,0.994591
3,Questions and Answers,little0bird,0.993904
4,One Step at a Time,IcyPanther,0.993819
5,Looking Beyond,shini-amaryllis,0.993652
6,UnVeiled,Snapegirlkmf,0.993493
7,Morphed Secrets,nightkitty555,0.993312
8,How to tame a Marauder,melian225,0.993238
9,Take The Tumble,Kittenshift17,0.993226



Top 10 stories similar to 'Isolation by Bex-chan':


Unnamed: 0,title,author,similarity
0,To Shape and Change,Blueowl,0.999813
1,His Own Man,Crunchysunrises,0.999698
2,Wind Shear,Chilord,0.99957
3,A Wonderful Caricature of Intimacy,Countess of Abe,0.999548
4,Make A Wish,Rorschach's Blot,0.999535
5,The Thief of Hogwarts,bluminous8,0.999499
6,No Hurry At All,RobSt,0.999453
7,Harry Potter and the Champion's Champion,DriftWood1965,0.999401
8,Oh God Not Again!,Sarah1281,0.999379
9,A Different Halloween,RobSt,0.999359



Top 10 stories similar to 'Harry Potter and the Rune Stone Path by Temporal Knight':


Unnamed: 0,title,author,similarity
0,The Legacy Preservation Act,James Spookie,0.999172
1,Harry Potter and the Four Heirs,Sinyk,0.999057
2,Broken,inadaze22,0.998992
3,What We're Fighting For,James Spookie,0.998969
4,Harry Potter and the Game,Concept101,0.998835
5,Luna's Hubby,Meteoricshipyards,0.998656
6,Harry Potter and the Hero's Path,TheJackOfDiamonds,0.998592
7,Harry Crow,RobSt,0.998584
8,Dodging Prison and Stealing Witches - Revenge ...,LeadVonE,0.998563
9,Honour Thy Blood,TheBlack'sResurgence,0.998537
