In [2]:
import pandas as pd
import requests
import datetime
import json
import lxml
import altair as alt


In [4]:
# One-time setup: uncomment to install the packages this notebook needs.
# %pip install jsonschema
# %pip install altair

#### Graph the total number of words in the crossword over time

In [3]:
# Load every clue/answer row; the relative path resolves against the
# current working directory.
crosswords_path = "crossword_allyears.csv"
df = pd.read_csv(crosswords_path)

# Derive the weekday name from the publication date. errors='coerce' turns
# unparseable dates into NaT, so those rows get a missing day_of_week
# instead of raising.
publication_dates = pd.to_datetime(df['crossword_date'], errors='coerce')
df['day_of_week'] = publication_dates.dt.day_name()
df

Unnamed: 0.1,Unnamed: 0,crossword_date,answers,clues,crossword_year,day_of_week
0,246591,1976-08-04,EDEN,Barbara of TV,1976,Wednesday
1,240444,1976-05-26,GAP,"Generation, for one",1976,Wednesday
2,240443,1976-05-26,PRIAM,Trojan king,1976,Wednesday
3,240442,1976-05-26,LOON,Hendrik Willem van ___,1976,Wednesday
4,240441,1976-05-26,TRAPS,Catches,1976,Wednesday
...,...,...,...,...,...,...
1422632,217636,2022-03-05,MOSS,Much of Iceland's greenery,2022,Saturday
1422633,217637,2022-03-05,PROVEIT,"""I don't believe you""",2022,Saturday
1422634,217638,2022-03-05,CONEOFSILENCE,Fictional device in which to convey secret inf...,2022,Saturday
1422635,217640,2022-03-05,MOE,"""Calvin and Hobbes"" character described as ""a ...",2022,Saturday


In [4]:
# Count rows (one per clue/answer pair) for each crossword year.
# The column names are set explicitly: a bare value_counts().reset_index()
# yields 'index'/'crossword_year' on pandas < 2.0 but 'crossword_year'/'count'
# on pandas >= 2.0, which would break the 'index' encoding used below.
count_words = (
    df['crossword_year']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='crossword_year')
)

# Bar chart: year on the x-axis, number of words that year on the y-axis.
alt.Chart(count_words).mark_bar().encode(
    x=alt.X('index', title='Year'),
    y=alt.Y('crossword_year', title='Number of words')
)

In [5]:
# Inspect answers containing a given substring (literal match, not a regex).
# Swap in another substring such as 'YOGA' to look at a different word.
# na=False makes rows with a missing answer count as non-matches.
contains_chai = df['answers'].str.contains('CHAI', regex=False, na=False)
df[contains_chai].head(60)


Unnamed: 0.1,Unnamed: 0,crossword_date,answers,clues,crossword_year,day_of_week
4181,238569,1976-05-04,CHAIN,Daisy or ball-and,1976,Tuesday
8004,242391,1976-06-17,CHAISE,___ longue,1976,Thursday
8238,243772,1976-07-03,CHAITS,Hindu months,1976,Saturday
8255,243719,1976-07-03,CHAIRS,Musical ___,1976,Saturday
19756,255691,1976-11-17,CHAILLOT,Madwoman's place,1976,Wednesday
20640,253175,1976-10-19,CHAI,Gypsy girl,1976,Tuesday
29474,252470,1976-10-10,CHAIR,Preside over,1976,Sunday
51553,261380,1977-01-21,CHAIR,Morris or electric,1977,Friday
53243,262942,1977-02-08,CHAIRMAN,One with a gavel,1977,Tuesday
59112,268791,1977-04-17,AITCHAITCHAITCH,Humphrey,1977,Sunday


#### Plot the number of slang terms over time

In [12]:
# Phrases in a clue that flag its answer as slang/informal usage.
# Matching is literal (regex=False) and case-sensitive. 'slangily' is already
# covered by 'slang' but is kept so the list states the intent explicitly.
SLANG_MARKERS = [
    'in texts',
    'colloquially',
    'modern lingo',
    'slang',
    'informally',
    'slangily',
]

# OR the per-marker matches together; na=False treats a missing clue as a
# non-match.
is_slang_clue = pd.Series(False, index=df.index)
for marker in SLANG_MARKERS:
    is_slang_clue = is_slang_clue | df['clues'].str.contains(marker, regex=False, na=False)
slang = df[is_slang_clue]

# Slang clues per year. Column names are set explicitly because a bare
# value_counts().reset_index() only produces an 'index' column on
# pandas < 2.0; on pandas >= 2.0 the sort below would raise a KeyError.
count_slang = (
    slang['crossword_year']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='crossword_year')
)

count_slang.sort_values('index')

Unnamed: 0,index,crossword_year
44,1977,1
43,1978,3
36,1979,8
32,1980,11
31,1981,13
39,1982,7
35,1983,9
34,1984,9
37,1985,8
42,1987,4


#### Plot which years debuted the most slang terms

In [6]:
# Keep only the earliest row for each slang answer (its "debut").
# Assumes crossword_date holds YYYY-MM-DD strings, as in the rows displayed
# earlier, so that sorting the column is chronological -- TODO confirm.
debut_slang = (
    slang
    .sort_values(by='crossword_date')
    .drop_duplicates(subset=['answers'], keep='first')
)

# Number of debuts per year. Column names are set explicitly because a bare
# value_counts().reset_index() only produces an 'index' column on
# pandas < 2.0, which would break the chart encoding on pandas >= 2.0.
debut_slang_year = (
    debut_slang['crossword_year']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='crossword_year')
)

# Bar chart: year on the x-axis, number of slang debuts on the y-axis.
alt.Chart(debut_slang_year).mark_bar().encode(
    x=alt.X('index', title='Year'),
    y=alt.Y('crossword_year', title='Number of slang terms debuted')
)


#### I exported the above dataframe to a CSV and, in that file, added each word's part of speech and categorized some of the words

In [16]:
count = 0  # NOTE(review): not used by any cell visible here
debut_detailed = pd.read_csv("debut_slang.csv")

# Turn the comma-separated category string into a list of labels per row;
# rows without a category stay missing.
category_lists = debut_detailed['category'].str.split(",")
debut_detailed['category'] = category_lists

# Keep only the words that were given at least one category.
has_category = debut_detailed['category'].notna()
categories = debut_detailed[has_category]
categories

Unnamed: 0,crossword_date,answers,clues,crossword_year,day_of_week,category,part_of_speech
1,1978-03-19,LIBBER,"Feminist, colloquially",1978,Sunday,"[women, gender, politics]",noun
5,1979-02-25,GAL,"Colleen, colloquially",1979,Sunday,"[women, gender]",noun
7,1979-07-26,BLUES,"Navy uniform, informally",1979,Thursday,"[occupation, military]",noun
8,1979-08-14,HEEL,"Cad, colloquially",1979,Tuesday,"[men, gender]",noun
9,1979-09-02,ERK,"R.A.F. underling, informally",1979,Sunday,"[occupation, military]",noun
...,...,...,...,...,...,...,...
2350,2022-06-19,ACIDHEAD,"Lover of psychedelics, informally",2022,Sunday,[drugs/cigs],noun
2351,2022-06-23,DESI,"South Asian, informally",2022,Thursday,[native of...],noun
2352,2022-06-26,LITMAJOR,"One with a storied education, informally?",2022,Sunday,[school],noun
2354,2022-06-30,ROTO,"Kind of scoring in fantasy sports leagues, inf...",2022,Thursday,[sports],noun


In [19]:
# One row per (word, category) pair, so a word with several categories is
# counted once under each of them.
categories = categories.explode('category')

# Most common categories overall (at most the first 60 are shown).
category_totals = categories['category'].value_counts().reset_index()
category_totals.head(60)


Unnamed: 0,index,category
0,occupation,125
1,drink/food,93
2,arts,88
3,sports,77
4,money,73
5,school,61
6,fashion,59
7,music,52
8,relationship,48
9,gender,48


In [1]:
# Five most frequent slang categories among words with a Monday date.
# NOTE(review): the saved output for this cell is a NameError because it was
# run before `categories` was defined; re-run it after the cells above.
is_monday = categories['day_of_week'] == 'Monday'
categories.loc[is_monday, 'category'].value_counts().head(5)

NameError: name 'categories' is not defined

In [66]:
# Five most frequent slang categories among words with a Tuesday date.
is_tuesday = categories['day_of_week'] == 'Tuesday'
categories.loc[is_tuesday, 'category'].value_counts().head(5)

occupation    22
sports        15
military      15
fashion       13
arts          13
Name: category, dtype: int64

In [65]:
# Five most frequent slang categories among words with a Wednesday date.
is_wednesday = categories['day_of_week'] == 'Wednesday'
categories.loc[is_wednesday, 'category'].value_counts().head(5)

occupation    16
sports        15
drink/food    11
arts          10
fashion        9
Name: category, dtype: int64

In [64]:
# Five most frequent slang categories among words with a Thursday date.
is_thursday = categories['day_of_week'] == 'Thursday'
categories.loc[is_thursday, 'category'].value_counts().head(5)

occupation      23
drink/food      16
sports          15
arts            14
relationship    11
Name: category, dtype: int64

In [63]:
# Five most frequent slang categories among words with a Friday date.
is_friday = categories['day_of_week'] == 'Friday'
categories.loc[is_friday, 'category'].value_counts().head(5)

occupation    17
drink/food    16
arts          15
sports        15
music         10
Name: category, dtype: int64

In [62]:
# Five most frequent slang categories among words with a Saturday date.
is_saturday = categories['day_of_week'] == 'Saturday'
categories.loc[is_saturday, 'category'].value_counts().head(5)

occupation    25
sports        15
gender        13
money         11
fashion       10
Name: category, dtype: int64

In [61]:
# Five most frequent slang categories among words with a Sunday date.
is_sunday = categories['day_of_week'] == 'Sunday'
categories.loc[is_sunday, 'category'].value_counts().head(5)

occupation      29
sports          27
relationship    22
drink/food      21
arts            19
Name: category, dtype: int64

In [76]:
# How many rows of the annotated debut file carry each part-of-speech label.
debut_detailed['part_of_speech'].value_counts()

noun         1949
verb          221
adjective     162
idk            24
pronoun         1
Name: part_of_speech, dtype: int64

In [122]:
# Count each category within every crossword year, largest counts first.
per_year_counts = categories.groupby(["crossword_year"])["category"].value_counts(ascending=False)

# Keep the five largest counts in each year; group_keys=False stops the year
# from being added to the index a second time.
top5_per_year = per_year_counts.groupby(level=0, group_keys=False).nlargest(5)

# Show the last 60 rows.
top5_per_year.to_frame().tail(60)

# To spot-check a single year, run e.g.:
# categories[categories['crossword_year'] == 2015].category.value_counts().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,category
crossword_year,category,Unnamed: 2_level_1
2011,drink/food,6
2011,age,3
2011,military,3
2011,gender,3
2011,money,3
2012,sports,5
2012,arts,4
2012,appearance,4
2012,cool/awesome,3
2012,team name,3
