In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
# Enable tqdm's pandas integration (adds the `progress_apply`-style methods to pandas objects).
# NOTE(review): none of the cells visible below call those methods — confirm this is still needed.
tqdm.pandas()

In [2]:
# Path to the trope catalogue CSV (relative, so it resolves against the kernel's working directory).
dataset_path = "../dataset/TVTropesData/tropes.csv"

In [3]:
# Read the trope catalogue. The file's first column becomes the row index, and the
# three data columns are forced to `str`.
tropes_raw = pd.read_csv(
    dataset_path,
    index_col=0,
    dtype={"TropeID": str, "Trope": str, "Description": str},
)

In [4]:
# Preview the loaded catalogue.
tropes_raw

Unnamed: 0,TropeID,Trope,Description
0,t00001,AbandonedArea,\nAbandoned places make good settings for fict...
1,t00002,AbandonedCatchphrase,Catchphrases are a great and simple way to hel...
2,t00003,AbandonedHospital,The creepy abandoned hospital/mental instituti...
3,t00004,AbandonedHospitalAwakening,"An Abandoned Hospital Awakening is, as the nam..."
4,t00005,AbandonedInfoPage,When a work is getting more and more complicat...
...,...,...,...
30979,t30980,FlowersInHerHair,\nFlowers have long been a symbol of femininit...
30980,t30981,KicktheSonOfABitch,\nThere's nothing quite so evil as when a char...
30981,t30982,ParentProducedProject,"When a child needs a school project done, some..."
30982,t30983,CutAwayGag,\nA Cutaway Gag is a joke generally found in s...


In [6]:
# Count the tropes whose description is missing.
tropes_raw["Description"].isna().sum()

24

In [7]:
# Trim leading/trailing spaces, tabs, newlines and carriage returns from every description
# (the raw preview shows some descriptions starting with "\n"). Missing values stay missing.
# Note: this overwrites the column on tropes_raw in place.
tropes_raw["Description"] = tropes_raw["Description"].str.strip(' \t\n\r')

In [8]:
# Summary of the descriptions: non-null count, number of distinct texts, most repeated text.
tropes_raw["Description"].describe()

count                                                 30960
unique                                                28130
top       Lampshade Hanging (or, more informally, "Lamps...
freq                                                     10
Name: Description, dtype: object

In [9]:
# Summary of the trope names; count == unique means every name occurs once.
tropes_raw["Trope"].describe()

count             30984
unique            30984
top       AbandonedArea
freq                  1
Name: Trope, dtype: object

In [10]:
# Summary of the trope IDs; count == unique means every ID occurs once.
tropes_raw["TropeID"].describe()

count      30984
unique     30984
top       t00001
freq           1
Name: TropeID, dtype: object

In [11]:
# Show every row whose description also appears on at least one other row.
# keep=False flags all members of a duplicate group, not just the repeats; pandas also
# treats missing descriptions as duplicates of each other, so those rows are listed too.
tropes_raw.loc[tropes_raw["Description"].duplicated(keep=False)]

Unnamed: 0,TropeID,Trope,Description
27,t00028,ABNegative,Although most of the real world gets by quite ...
38,t00039,ABoyAndHisX,A heartwarming story told through the ages: So...
69,t00070,AbsurdlySharpBlade,An absurdly sharp blade is a bladed weapon wit...
72,t00073,AbsurdlyYouthfulMother,When you subtract the age (or apparent age) of...
78,t00079,AbusiveParents,Parents are supposed to be the protectors of c...
...,...,...,...
30976,t30977,AntiPoopsocking,"""Poopsocking"" is when a video gamer is so engr..."
30978,t30979,SexySchoolwoman,The widely held belief that girls who attend R...
30980,t30981,KicktheSonOfABitch,There's nothing quite so evil as when a charac...
30982,t30983,CutAwayGag,A Cutaway Gag is a joke generally found in sil...


In [12]:
# Same duplicate-description listing as the previous cell. The mask returned by
# `duplicated` is already boolean, so it is used directly rather than compared with `== True`.
tropes_raw[tropes_raw["Description"].duplicated(keep=False)]

Unnamed: 0,TropeID,Trope,Description
27,t00028,ABNegative,Although most of the real world gets by quite ...
38,t00039,ABoyAndHisX,A heartwarming story told through the ages: So...
69,t00070,AbsurdlySharpBlade,An absurdly sharp blade is a bladed weapon wit...
72,t00073,AbsurdlyYouthfulMother,When you subtract the age (or apparent age) of...
78,t00079,AbusiveParents,Parents are supposed to be the protectors of c...
...,...,...,...
30976,t30977,AntiPoopsocking,"""Poopsocking"" is when a video gamer is so engr..."
30978,t30979,SexySchoolwoman,The widely held belief that girls who attend R...
30980,t30981,KicktheSonOfABitch,There's nothing quite so evil as when a charac...
30982,t30983,CutAwayGag,A Cutaway Gag is a joke generally found in sil...


In [13]:
# The raw boolean mask behind the listings above: True where the description is shared
# with at least one other row.
tropes_raw["Description"].duplicated(keep=False)

0        False
1        False
2        False
3        False
4        False
         ...  
30979    False
30980     True
30981    False
30982     True
30983     True
Name: Description, Length: 30984, dtype: bool

In [14]:
# Lookup table from trope name to trope ID.
trope_dict = dict(zip(tropes_raw["Trope"], tropes_raw["TropeID"]))

In [15]:
# Relative paths to the two trope-usage tables read in the next cell.
lit_tropes = "../dataset/TVTropesData/lit_tropes.csv"
lit_goodread_tropes = "../dataset/TVTropesData/lit_goodreads_match.csv"

In [16]:
# Load both trope-usage tables, using each file's first column as the row index.
# lit_df has the columns Title, Trope, Example, trope_id and title_id (see the preview below).
# NOTE(review): goodreads_df's full schema is not shown here; only `trope_id` is used later.
lit_df = pd.read_csv(lit_tropes, index_col=0)
goodreads_df = pd.read_csv(lit_goodread_tropes, index_col=0)

In [17]:
# Preview the literature trope-usage table.
lit_df

Unnamed: 0,Title,Trope,Example,trope_id,title_id
0,ABadCaseOfStripes,StockYuck,"Lima beans:\n On The Simpsons, when Marge is ...",t21535,lit0
1,ABadCaseOfStripes,InvoluntaryShapeshifting,A Bad Case Of Stripes focuses on a girl named...,t27289,lit0
2,ABadCaseOfStripes,InvoluntaryShapeShifting,A Bad Case Of Stripes focuses on a girl named...,t28603,lit0
3,ABadCaseOfStripes,OneOfTheseIsNotLikeTheOthers,"In A Bad Case Of Stripes, the Specialists det...",t28668,lit0
4,ABadCaseOfStripes,LabCoatOfScienceAndMedicine,"In A Bad Case Of Stripes, the five doctors al...",t29826,lit0
...,...,...,...,...,...
685045,ZuleikaDobson,YouCantFightFate,By the time the Duke thinks better of his pla...,t26457,lit15494
685046,ZuleikaDobson,KillEmAll,"Every single undergraduate in Oxford, dead.",t12334,lit15494
685047,ZuleikaDobson,SeriousBusiness,Drowning yourself for love of a Femme Fatale...,t19895,lit15494
685048,ZuleikaDobson,FemmeFatale,The title character of Max Beerbohm's Black C...,t07883,lit15494


In [18]:
# Find the most frequent trope IDs in both DataFrames,
# then choose a subset of them to use when querying LLaMA.

In [19]:
# The 750 trope IDs that occur most often in the literature table, with their counts.
lit_df["trope_id"].value_counts().nlargest(n=750)

t20214    1431
t13853    1103
t29575    1058
t02187     953
t02336     788
          ... 
t22985     122
t05858     122
t01931     122
t17941     122
t08657     122
Name: trope_id, Length: 750, dtype: int64

In [20]:
# The 750 trope IDs that occur most often in the Goodreads-matched table, with their counts.
goodreads_df["trope_id"].value_counts().nlargest(n=750)

t20214    688
t13853    643
t29575    557
t03644    467
t02336    462
         ... 
t12458     64
t12691     64
t03737     64
t08111     64
t08657     64
Name: trope_id, Length: 750, dtype: int64

In [60]:
# Trope IDs that rank in the top 462 by frequency in BOTH tables.
# NOTE(review): 462 looks hand-tuned — the length check that follows reports exactly 400
# shared IDs; confirm before changing it.
common_tropes = set(
    lit_df["trope_id"].value_counts().nlargest(462).index
).intersection(
    goodreads_df["trope_id"].value_counts().nlargest(462).index
)

In [61]:
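# Abandoned experiment, kept for reference only. It would not run as written:
# `s1` is a plain list, and lists have no `.isin`. The set intersection that builds
# `common_tropes` does this job instead.
# s1 = lit_df['trope_id'].value_counts().nlargest(50).index.tolist()
# s2 = goodreads_df['trope_id'].value_counts().nlargest(50).index.tolist()
# s1[s1.isin(s2)]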

In [62]:
# Number of trope IDs shared by both top-frequency lists (a set supports len() directly,
# so no intermediate list copy is needed).
len(common_tropes)

400

In [22]:
# The 50 least frequent trope IDs in the literature table, with their counts.
lit_df["trope_id"].value_counts().nsmallest(n=50)

t04206    1
t20547    1
t10877    1
t30329    1
t21088    1
t09165    1
t28401    1
t25400    1
t07807    1
t25170    1
t08541    1
t16307    1
t13284    1
t08446    1
t14063    1
t02998    1
t07943    1
t21137    1
t23918    1
t30343    1
t13948    1
t11795    1
t20096    1
t02969    1
t27802    1
t02603    1
t23333    1
t23917    1
t23583    1
t19219    1
t20635    1
t06723    1
t07215    1
t09211    1
t18164    1
t26123    1
t18389    1
t22190    1
t21418    1
t16395    1
t19631    1
t30719    1
t05166    1
t23024    1
t17251    1
t16200    1
t12276    1
t04110    1
t15877    1
t22116    1
Name: trope_id, dtype: int64

In [23]:
# The 50 least frequent trope IDs in the Goodreads-matched table, with their counts.
goodreads_df["trope_id"].value_counts().nsmallest(n=50)

t21404    1
t11195    1
t18515    1
t28631    1
t15160    1
t02140    1
t13039    1
t03108    1
t28489    1
t20284    1
t12611    1
t10026    1
t27678    1
t01132    1
t21542    1
t13028    1
t02081    1
t10498    1
t10566    1
t05579    1
t25583    1
t30634    1
t25788    1
t22017    1
t30282    1
t03157    1
t08880    1
t28838    1
t16279    1
t22589    1
t09950    1
t13040    1
t27586    1
t18006    1
t23782    1
t03551    1
t14920    1
t17093    1
t15459    1
t01116    1
t11684    1
t15200    1
t26369    1
t17749    1
t08707    1
t08688    1
t08755    1
t16911    1
t12735    1
t30759    1
Name: trope_id, dtype: int64

In [69]:
# Sort the per-trope counts ascending, keep the 20000 least frequent IDs, then show the
# last 51 of those (the most frequent end of that slice). This is the same expression
# used to build `less_common_tropes` below.
# NOTE(review): 20000 and 51 are magic numbers; the order among equal counts is whatever
# the sort produces, so which tied IDs land in the slice is arbitrary.
lit_df['trope_id'].value_counts().sort_values(ascending=True).head(20000).tail(51)  # last 51 of the 20000 rarest

t05412    27
t02925    27
t26803    27
t09608    27
t00499    27
t22113    27
t26216    27
t06512    27
t23491    27
t24005    27
t30000    27
t23221    27
t13668    27
t14852    27
t26326    27
t09074    27
t30487    27
t28705    27
t05730    27
t26889    27
t14739    27
t19267    27
t24870    28
t02654    28
t24584    28
t17690    28
t13615    28
t27591    28
t03861    28
t21533    28
t04128    28
t07156    28
t01213    28
t06857    28
t22111    28
t28332    28
t21776    28
t23309    28
t24860    28
t20743    28
t09177    28
t12269    28
t15405    28
t28656    28
t28470    28
t10868    28
t03973    28
t28573    28
t07437    28
t06016    28
t00510    28
Name: trope_id, dtype: int64

In [25]:
# Same slice for the Goodreads-matched table, but previewing only the last 6 rows of the
# 20000 least frequent IDs (the selection cell further down takes 51).
goodreads_df["trope_id"].value_counts().sort_values().head(20000).tail(6)

t23253    20
t18160    20
t28820    20
t09746    20
t28281    20
t15955    20
Name: trope_id, dtype: int64

In [40]:
# Sanity check: confirm common_tropes is a plain Python set before combining it with other sets.
type(common_tropes)

set

In [70]:
# Start the "less common" selection with the literature table: of its 20000 least
# frequent trope IDs, take the 51 with the highest counts.
less_common_tropes = set(
    lit_df["trope_id"].value_counts().sort_values().head(20000).tail(51).index
)

In [71]:
# Add the matching slice from the Goodreads-matched table (in-place set union, so
# re-running this cell does not change the result), then display the combined set.
less_common_tropes |= set(
    goodreads_df["trope_id"].value_counts().sort_values().head(20000).tail(51).index
)
less_common_tropes

{'t00499',
 't00510',
 't00522',
 't00974',
 't01213',
 't01342',
 't01885',
 't02003',
 't02654',
 't02925',
 't03687',
 't03861',
 't03973',
 't04128',
 't05412',
 't05730',
 't06016',
 't06490',
 't06512',
 't06811',
 't06857',
 't07151',
 't07156',
 't07437',
 't07474',
 't08047',
 't08350',
 't08765',
 't09074',
 't09177',
 't09608',
 't09746',
 't09987',
 't10066',
 't10596',
 't10868',
 't11481',
 't11802',
 't12269',
 't12751',
 't12818',
 't13458',
 't13615',
 't13668',
 't14739',
 't14852',
 't15405',
 't15494',
 't15955',
 't16009',
 't17675',
 't17690',
 't18160',
 't18448',
 't18574',
 't18589',
 't18665',
 't19267',
 't20374',
 't20743',
 't21293',
 't21384',
 't21533',
 't21776',
 't21795',
 't22111',
 't22113',
 't22938',
 't23221',
 't23253',
 't23309',
 't23491',
 't24005',
 't24251',
 't24584',
 't24860',
 't24870',
 't26182',
 't26216',
 't26324',
 't26326',
 't26375',
 't26803',
 't26889',
 't27188',
 't27277',
 't27331',
 't27591',
 't27746',
 't27763',
 't28281',

In [72]:
# Final selection: the shared high-frequency trope IDs plus the mid-frequency ones.
tropes = common_tropes | less_common_tropes

In [73]:
# Size of the final trope selection (len() works on the set directly; no list copy needed).
len(tropes)

500

In [74]:
# How many catalogue rows fall inside (True) and outside (False) the selected ID set.
tropes_raw["TropeID"].isin(tropes).value_counts()

False    30484
True       500
Name: TropeID, dtype: int64

In [79]:
# Write the selected catalogue rows (columns TropeID, Trope, Description) to my_tropes.csv.
# index=False leaves the frame's integer row index out of the file. `index_label` only
# names the index column when the index is written, so the earlier index_label="TropeID"
# had no effect and is dropped to avoid implying otherwise.
tropes_raw[tropes_raw["TropeID"].isin(tropes)].to_csv("my_tropes.csv", index=False)

In [76]:
# Display the catalogue rows that belong to the final trope selection.
tropes_raw.loc[tropes_raw["TropeID"].isin(tropes)]

Unnamed: 0,TropeID,Trope,Description
78,t00079,AbusiveParents,Parents are supposed to be the protectors of c...
189,t00190,ActionGirl,An Action Girl is a female badass who is tough...
262,t00263,AdaptationDistillation,Some adaptations take a complex character or s...
264,t00265,AdaptationExpansion,This is the complete opposite of Compressed Ad...
269,t00270,AdaptationNameChange,Alice Andrews in the books becomes Alice Allen...
...,...,...,...
29999,t30000,Redshirts,This is the Good Counterpart of Evil Minions a...
30219,t30220,Bioaugmentation,The biological counterpart to Cyborging: inste...
30285,t30286,XenoFiction,A few works of Science Fiction and Fantasy (an...
30469,t30470,RecountedByTheMainCharacters,"Sometimes, ""this is a true story"" is part of t..."
