# NLP Project Pt. 2: Data Cleaning

In [1]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth',200)
corpus = pd.read_csv('corpus_df3.csv')
#new new corpus: 'corpus_df3.csv'
corpus.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,Annunciation,Lauren Groff,"February 7, 2022","['Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. ..."
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,Once Removed,Alexander MacLeod,"January 31, 2022","['She did not want to visit the old lady.', 'Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26\xa0a.m. It was time to go. Ninety degrees, ninet..."
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,Long Distance,Ayşegül Savaş,"January 24, 2022","['Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She though..."
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"What’s the Deal, Hummingbird?",Arthur Krystal,"January 17, 2022","['On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was seven..."
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,Fireworks,Graham Swift,"January 10, 2022","['It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end.', 'It was a common remark: “Cheer up, it’s ..."


In [2]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988 entries, 0 to 987
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     988 non-null    object
 1   TITLE   988 non-null    object
 2   AUTHOR  988 non-null    object
 3   DATE    988 non-null    object
 4   TEXT    988 non-null    object
dtypes: object(5)
memory usage: 38.7+ KB


In [3]:
#check for duplicates
corpus[corpus.duplicated()]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT


In [4]:
#look for more synopses
for count, text in enumerate(corpus.TEXT):
    if ("short story by" in text[0:100]) | \
    ("Short story about" in text[0:100]) | \
    ("Short story in which" in text[0:100]):
        print(count, corpus.loc[count].TITLE, text[0:100])

740 Demolition ['The New Yorker, December 25, 2006 P. 70', "Short story about a young cemetery worker's affair with
760 The Spot ['The New Yorker, August 21, 2006 P. 70', "Short story about a teen-age prostitute in Ohio who murde
761 Bad Neighbors ['The New Yorker, August 7, 2006 P. 72', "Short story about the Benningtons, a working-class black f
762 First Defeat (1939) ['The New Yorker, July 31, 2006 P. 66', 'Short story about a quartermaster who surrenders to the los
763 Folie à Deux ['The New Yorker, July 24, 2006 P. 60', 'Short story about a chance encounter a middle-aged man has 
767 Accident Brief ['The New Yorker, June 19, 2006 P. 74', "Short story about a boy, Tek, who accompanies a mute on a b
773 Cinderella School ['The New Yorker, May 22, 2006 P. 68', "Short story about a Russian immigrant woman in Brooklyn who 
794 Beauty Is A Fate Worse Than Death ['The New Yorker, December 26, 2005 P. 108', 'Short story about a Moroccan lawyer who dreams that hi
804 Summer Crossing ['The 

In [5]:
#drop synopses
def synopses_finder(corpus):
    synopses = []
    for count, text in enumerate(corpus.TEXT):
        if ("short story by" in text) or ("Short story about" in text) or ("Short story in which" in text):
            title = corpus.TITLE[count]
            author = corpus.AUTHOR[count]
            synopses.append((title, author))
    return synopses

In [6]:
synopses = synopses_finder(corpus)
synopses

[('Demolition', 'Louise Erdrich'),
 ('The Spot', 'David Means'),
 ('Bad Neighbors', 'Edward P. Jones'),
 ('First Defeat (1939)', 'Alberto Méndez'),
 ('Folie à Deux', 'William Trevor'),
 ('Accident Brief', 'Karen Russell'),
 ('God the Novelist', 'Henry Roth'),
 ('Cinderella School', 'Lara Vapnyar'),
 ('Beauty Is A Fate Worse Than Death', 'Tahar Ben Jelloun'),
 ('Summer Crossing', 'Truman Capote'),
 ('The Kidney-Shaped Stone That Moves Every Day', 'Haruki Murakami'),
 ('The Blow', 'J. M. Coetzee'),
 ('Where I’m Likely To Find It', 'Haruki Murakami'),
 ('Foreigners', 'Andrew O’Hagan'),
 ('In the Palace of the End', 'Martin Amis'),
 ('Ice Man', 'Haruki Murakami'),
 ('Without Blood', 'Alessandro Baricco'),
 ('Jolene: A Life', 'E. L. Doctorow'),
 ('The Ant of the Self', 'ZZ Packer'),
 ('The Bees, Part 1', 'Aleksander Hemon'),
 ('The Obscure Object', 'Jeffrey Eugenides'),
 ('Sacred Statues', 'William Trevor'),
 ('Airplane', 'Haruki Murakami'),
 ('The Courtesy', 'John Berger'),
 ('A Primer for

In [7]:
corpus['SYNOPSIS'] = np.nan

In [8]:
synopses_dict = dict(synopses)
len(synopses_dict)

33

In [9]:
def synopses_matcher(df, reference):
        for k, v in reference.items():
            if (df['TITLE'] == k) & (df['AUTHOR'] == v):
                return 1
            
corpus['SYNOPSIS'] = corpus.apply(synopses_matcher, reference=synopses_dict, axis=1)
corpus.SYNOPSIS.value_counts()

1.0    33
Name: SYNOPSIS, dtype: int64

In [10]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988 entries, 0 to 987
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   URL       988 non-null    object 
 1   TITLE     988 non-null    object 
 2   AUTHOR    988 non-null    object 
 3   DATE      988 non-null    object 
 4   TEXT      988 non-null    object 
 5   SYNOPSIS  33 non-null     float64
dtypes: float64(1), object(5)
memory usage: 46.4+ KB


In [11]:
corpus = corpus.loc[corpus['SYNOPSIS'].isna()==True]
corpus.drop(columns=['SYNOPSIS'], inplace=True)
corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 955 entries, 0 to 987
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     955 non-null    object
 1   TITLE   955 non-null    object
 2   AUTHOR  955 non-null    object
 3   DATE    955 non-null    object
 4   TEXT    955 non-null    object
dtypes: object(5)
memory usage: 44.8+ KB


In [12]:
corpus.reset_index(drop=True, inplace=True)

In [13]:
#checking again
for count, text in enumerate(corpus.TEXT):
    if ("The New Yorker" in text[0:100]):
        print(count, corpus.loc[count].TITLE, text[0:100])

747 Greensleeves ['The New Yorker, November 13, 2006 P. 84', '“Gardening!” the girl said, and tilted back in her chai
755 Something That Needs Nothing ['The New Yorker, September 18, 2006 P. 69', 'In an ideal world, we would have been orphans. We felt
765 In Kislovodsk ['The New Yorker, June 12, 2006 P. 58', "Short Story about a doctor, Nikolay Victorovich, and his wi
769 The Last Days of Muhammad Atta ['The New Yorker, April 24, 2006 P. 152', 'Short story imagining the last few days of the life of Mu
774 A Love Letter ['The New Yorker, March 27, 2006 P. 66', 'First, I would like to fall on my knees in front of the I.
786 Last Evenings on Earth ['The New Yorker, December 26, 2005 P. 80', 'This is the situation: B and his father are going on va
788 LA CONCHITA ['The New Yorker, December 12, 2005 P. 95', 'In my business, where you put something like forty to f
818 The Conductor ['The New Yorker, February 28, 2005 P. 70', 'In the 1989 “Anthology of Contemporary Bosnian Poetry,”
867 Extra 

In [14]:
#drop In Kislovodsk, The Last Days of Muhammad Atta, The Card Trick, Lost and Found
corpus.loc[(corpus['TITLE'] == 'In Kislovodsk') | \
           (corpus['TITLE'] == 'The Last Days of Muhammad Atta') | \
           (corpus['TITLE'] == 'The Card Trick') | \
           (corpus['TITLE'] == 'Lost and Found'), 'TITLE'] = None

In [15]:
corpus = corpus[corpus.TITLE.isna()==False]
corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 951 entries, 0 to 954
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     951 non-null    object
 1   TITLE   951 non-null    object
 2   AUTHOR  951 non-null    object
 3   DATE    951 non-null    object
 4   TEXT    951 non-null    object
dtypes: object(5)
memory usage: 44.6+ KB


In [16]:
corpus.reset_index(drop=True, inplace=True)

In [17]:
corpus.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,Annunciation,Lauren Groff,"February 7, 2022","['Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. ..."
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,Once Removed,Alexander MacLeod,"January 31, 2022","['She did not want to visit the old lady.', 'Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26\xa0a.m. It was time to go. Ninety degrees, ninet..."
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,Long Distance,Ayşegül Savaş,"January 24, 2022","['Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She though..."
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"What’s the Deal, Hummingbird?",Arthur Krystal,"January 17, 2022","['On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was seven..."
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,Fireworks,Graham Swift,"January 10, 2022","['It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end.', 'It was a common remark: “Cheer up, it’s ..."


In [19]:
import re
import string

#add wordcount column:
corpus['TEXT_STRIPPED'] = corpus.TEXT.map(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
corpus['TEXT_STRIPPED'] = corpus.TEXT_STRIPPED.map(lambda x: re.sub('xa0', ' ', x))
corpus['TEXT_WORDS'] = corpus.TEXT_STRIPPED.map(lambda x: re.split(' ', x))
corpus.head()

In [21]:
corpus['WORDCOUNT'] = corpus.TEXT_WORDS.map(lambda x: len(x))
corpus.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT,TEXT_STRIPPED,TEXT_WORDS,WORDCOUNT
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,Annunciation,Lauren Groff,"February 7, 2022","['Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. ...",Some nights in my dreams I find myself running through those hills above Palo Alto again It is always just before dawn and as I run I smell the suncrisped fields the sage the eucalyptus The mist f...,"[Some, nights, in, my, dreams, I, find, myself, running, through, those, hills, above, Palo, Alto, again, It, is, always, just, before, dawn, and, as, I, run, I, smell, the, suncrisped, fields, th...",9373
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,Once Removed,Alexander MacLeod,"January 31, 2022","['She did not want to visit the old lady.', 'Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26\xa0a.m. It was time to go. Ninety degrees, ninet...",She did not want to visit the old lady Amy studied the stroller then the bags then her boyfriend and the baby She checked her phone 1126 am It was time to go Ninety degrees ninetypercent humidity ...,"[She, did, not, want, to, visit, the, old, lady, Amy, studied, the, stroller, then, the, bags, then, her, boyfriend, and, the, baby, She, checked, her, phone, 1126, am, It, was, time, to, go, Nine...",7778
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,Long Distance,Ayşegül Savaş,"January 24, 2022","['Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She though...",Lea changed the sheets when she got up She’d bought flowers the previous day tulips that she’d put on the dresser There were carnations on the kitchen table in a squat glass vase She thought they ...,"[Lea, changed, the, sheets, when, she, got, up, She’d, bought, flowers, the, previous, day, tulips, that, she’d, put, on, the, dresser, There, were, carnations, on, the, kitchen, table, in, a, squ...",4866
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"What’s the Deal, Hummingbird?",Arthur Krystal,"January 17, 2022","['On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was seven...",On or around May 5th of 2020 he just stopped He stopped exercising stopped walking stopped reading stopped planning He ate drank washed and paid the bills but that was it He was seventythree He’d ...,"[On, or, around, May, 5th, of, 2020, he, just, stopped, He, stopped, exercising, stopped, walking, stopped, reading, stopped, planning, He, ate, drank, washed, and, paid, the, bills, but, that, wa...",3469
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,Fireworks,Graham Swift,"January 10, 2022","['It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end.', 'It was a common remark: “Cheer up, it’s ...",It was late October 1962 Russian missiles were being shipped to Cuba Kennedy was having words with Khrushchev The world might be coming to an end It was a common remark “Cheer up it’s not the end ...,"[It, was, late, October, 1962, Russian, missiles, were, being, shipped, to, Cuba, Kennedy, was, having, words, with, Khrushchev, The, world, might, be, coming, to, an, end, It, was, a, common, rem...",2687


In [22]:
corpus.sort_values(['WORDCOUNT'])[0:20]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT,TEXT_STRIPPED,TEXT_WORDS,WORDCOUNT
668,https://www.newyorker.com/magazine/2008/05/05/what-they-believe-in,Sketchbook by Roz Chast,Roz Chast,"April 28, 2008",[],,[],1
928,https://www.newyorker.com/magazine/2002/01/07/jamaica,Jamaica,David Schickler,"December 30, 2001",[],,[],1
927,https://www.newyorker.com/magazine/2002/01/14/justinas-priest,Justina’s Priest,William Trevor,"January 6, 2002",[],,[],1
923,https://www.newyorker.com/magazine/2002/02/18/a-new-master,A New Master,Hannah Crafts,"February 10, 2002",[],,[],1
922,https://www.newyorker.com/magazine/2002/03/04/fascination-2,Fascination,William Boyd,"February 24, 2002",[],,[],1
55,https://www.newyorker.com/magazine/2020/12/28/acting-class,Acting Class,Nick Drnaso,"December 21, 2020","['This excerpt is drawn from “Acting Class,” by Nick Drnaso, out in 2022 from Drawn & Quarterly.']",This excerpt is drawn from “Acting Class” by Nick Drnaso out in 2022 from Drawn Quarterly,"[This, excerpt, is, drawn, from, “Acting, Class”, by, Nick, Drnaso, out, in, 2022, from, Drawn, , Quarterly]",17
930,https://www.newyorker.com/magazine/2001/12/10/surrounded-by-sleep,Surrounded by Sleep,Akhil Sharma,"December 2, 2001","['The New Yorker, December 10, 2001 P. 90', 'One August afternoon, when Ajay was ten years old, his elder brother, Aman, dove into a pool and struck his head on the cement bottom. For three minute...",The New Yorker December 10 2001 P 90 One August afternoon when Ajay was ten years old his elder brother Aman dove into a pool and struck his head on the cement bottom For three minutes he lay ther...,"[The, New, Yorker, December, 10, 2001, P, 90, One, August, afternoon, when, Ajay, was, ten, years, old, his, elder, brother, Aman, dove, into, a, pool, and, struck, his, head, on, the, cement, bot...",275
327,https://www.newyorker.com/magazine/2015/06/08/love-is-blind-and-deaf,Love Is Blind and Deaf,Jonathan Safran Foer,"June 1, 2015","['Adam and Eve lived together happily for a few days. Being blind, Adam never had to see the oblong, splotchy birthmark across Eve’s cheek, or her rotated incisor, or the gnawed remnants of her fi...",Adam and Eve lived together happily for a few days Being blind Adam never had to see the oblong splotchy birthmark across Eve’s cheek or her rotated incisor or the gnawed remnants of her fingernai...,"[Adam, and, Eve, lived, together, happily, for, a, few, days, Being, blind, Adam, never, had, to, see, the, oblong, splotchy, birthmark, across, Eve’s, cheek, or, her, rotated, incisor, or, the, g...",593
432,https://www.newyorker.com/magazine/2013/06/10/slide-to-unlock,Slide To Unlock,Ed Park,"June 3, 2013","['You cycle through your passwords. They tell the secret story. What’s most important to you, the things you think can’t be deciphered. Words and numbers stored in the lining of your heart.', 'You...",You cycle through your passwords They tell the secret story What’s most important to you the things you think can’t be deciphered Words and numbers stored in the lining of your heart Your daughter...,"[You, cycle, through, your, passwords, They, tell, the, secret, story, What’s, most, important, to, you, the, things, you, think, can’t, be, deciphered, Words, and, numbers, stored, in, the, linin...",823
686,https://www.newyorker.com/magazine/2007/12/24/the-arbus-factor,The Arbus Factor,Lore Segal,"December 16, 2007","['On one of the first days of the New Year, Jack called Hope. “Let’s have lunch,” he said. “I’ve got an agenda.” No need to specify the Café Provence on upper Broadway, or the time—fifteen minutes...",On one of the first days of the New Year Jack called Hope “Let’s have lunch” he said “I’ve got an agenda” No need to specify the Café Provence on upper Broadway or the time—fifteen minutes before ...,"[On, one, of, the, first, days, of, the, New, Year, Jack, called, Hope, “Let’s, have, lunch”, he, said, “I’ve, got, an, agenda”, No, need, to, specify, the, Café, Provence, on, upper, Broadway, or...",905


In [23]:
#filtering out more synopses based on wordcount
corpus = corpus[corpus['WORDCOUNT'] > 450]
corpus_df.reset_index(inplace=True, drop=True)
corpus.sort_values(['WORDCOUNT'])[0:10]

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT,TEXT_STRIPPED,TEXT_WORDS,WORDCOUNT
327,https://www.newyorker.com/magazine/2015/06/08/love-is-blind-and-deaf,Love Is Blind and Deaf,Jonathan Safran Foer,"June 1, 2015","['Adam and Eve lived together happily for a few days. Being blind, Adam never had to see the oblong, splotchy birthmark across Eve’s cheek, or her rotated incisor, or the gnawed remnants of her fi...",Adam and Eve lived together happily for a few days Being blind Adam never had to see the oblong splotchy birthmark across Eve’s cheek or her rotated incisor or the gnawed remnants of her fingernai...,"[Adam, and, Eve, lived, together, happily, for, a, few, days, Being, blind, Adam, never, had, to, see, the, oblong, splotchy, birthmark, across, Eve’s, cheek, or, her, rotated, incisor, or, the, g...",593
432,https://www.newyorker.com/magazine/2013/06/10/slide-to-unlock,Slide To Unlock,Ed Park,"June 3, 2013","['You cycle through your passwords. They tell the secret story. What’s most important to you, the things you think can’t be deciphered. Words and numbers stored in the lining of your heart.', 'You...",You cycle through your passwords They tell the secret story What’s most important to you the things you think can’t be deciphered Words and numbers stored in the lining of your heart Your daughter...,"[You, cycle, through, your, passwords, They, tell, the, secret, story, What’s, most, important, to, you, the, things, you, think, can’t, be, deciphered, Words, and, numbers, stored, in, the, linin...",823
686,https://www.newyorker.com/magazine/2007/12/24/the-arbus-factor,The Arbus Factor,Lore Segal,"December 16, 2007","['On one of the first days of the New Year, Jack called Hope. “Let’s have lunch,” he said. “I’ve got an agenda.” No need to specify the Café Provence on upper Broadway, or the time—fifteen minutes...",On one of the first days of the New Year Jack called Hope “Let’s have lunch” he said “I’ve got an agenda” No need to specify the Café Provence on upper Broadway or the time—fifteen minutes before ...,"[On, one, of, the, first, days, of, the, New, Year, Jack, called, Hope, “Let’s, have, lunch”, he, said, “I’ve, got, an, agenda”, No, need, to, specify, the, Café, Provence, on, upper, Broadway, or...",905
688,https://www.newyorker.com/magazine/2007/12/24/alma,Alma,Junot Díaz,"December 16, 2007","['You have a girlfriend named Alma, who has a long tender horse neck and a big Dominican ass that seems to exist in a fourth dimension beyond jeans. An ass that could drag the moon out of orbit. A...",You have a girlfriend named Alma who has a long tender horse neck and a big Dominican ass that seems to exist in a fourth dimension beyond jeans An ass that could drag the moon out of orbit An ass...,"[You, have, a, girlfriend, named, Alma, who, has, a, long, tender, horse, neck, and, a, big, Dominican, ass, that, seems, to, exist, in, a, fourth, dimension, beyond, jeans, An, ass, that, could, ...",983
296,https://www.newyorker.com/magazine/2016/01/25/aspic-fiction-tatyana-tolstaya,Aspic,Tatyana Tolstaya,"January 17, 2016","['Truth be told, I’ve always been afraid of it, since childhood. It’s prepared not casually, or whenever the fancy strikes you, but most often for New Year’s Eve, in the heart of winter, in the sh...",Truth be told I’ve always been afraid of it since childhood It’s prepared not casually or whenever the fancy strikes you but most often for New Year’s Eve in the heart of winter in the shortest an...,"[Truth, be, told, I’ve, always, been, afraid, of, it, since, childhood, It’s, prepared, not, casually, or, whenever, the, fancy, strikes, you, but, most, often, for, New, Year’s, Eve, in, the, hea...",1032
617,https://www.newyorker.com/magazine/2009/05/11/the-autobiography-of-j-g-b,The Autobiography of J.G.B.,J. G. Ballard,"May 4, 2009","['On waking one morning, B was surprised to see that Shepperton was deserted. He entered the kitchen at nine o’clock, annoyed to find that neither his post nor the daily newspapers had been delive...",On waking one morning B was surprised to see that Shepperton was deserted He entered the kitchen at nine o’clock annoyed to find that neither his post nor the daily newspapers had been delivered a...,"[On, waking, one, morning, B, was, surprised, to, see, that, Shepperton, was, deserted, He, entered, the, kitchen, at, nine, o’clock, annoyed, to, find, that, neither, his, post, nor, the, daily, ...",1032
543,https://www.newyorker.com/magazine/2011/03/14/going-for-a-beer,Going for a Beer,Robert Coover,"March 6, 2011","['He finds himself sitting in the neighborhood bar drinking a beer at about the same time that he began to think about going there for one. In fact, he has finished it. Perhaps he’ll have a second...",He finds himself sitting in the neighborhood bar drinking a beer at about the same time that he began to think about going there for one In fact he has finished it Perhaps he’ll have a second one ...,"[He, finds, himself, sitting, in, the, neighborhood, bar, drinking, a, beer, at, about, the, same, time, that, he, began, to, think, about, going, there, for, one, In, fact, he, has, finished, it,...",1079
400,https://www.newyorker.com/magazine/2014/01/27/the-frog-prince,The Frog Prince,Robert Coover,"January 19, 2014","['At first, it was great. Sure. It always is. She cuddled a frog, wishing for more, and—presto! A handsome prince who doted on her. It meant the end of her marriage, of course, but her ex was some...",At first it was great Sure It always is She cuddled a frog wishing for more and—presto A handsome prince who doted on her It meant the end of her marriage of course but her ex was something of a t...,"[At, first, it, was, great, Sure, It, always, is, She, cuddled, a, frog, wishing, for, more, and—presto, A, handsome, prince, who, doted, on, her, It, meant, the, end, of, her, marriage, of, cours...",1157
474,https://www.newyorker.com/magazine/2012/08/06/thank-you-for-the-light,Thank You for the Light,F. Scott Fitzgerald,"July 30, 2012","['Mrs. Hanson was a pretty, somewhat faded woman of forty, who sold corsets and girdles, travelling out of Chicago. For many years her territory had swung around through Toledo, Lima, Springfield,...",Mrs Hanson was a pretty somewhat faded woman of forty who sold corsets and girdles travelling out of Chicago For many years her territory had swung around through Toledo Lima Springfield Columbus ...,"[Mrs, Hanson, was, a, pretty, somewhat, faded, woman, of, forty, who, sold, corsets, and, girdles, travelling, out, of, Chicago, For, many, years, her, territory, had, swung, around, through, Tole...",1171
253,https://www.newyorker.com/magazine/2016/11/28/the-hanging-of-the-schoolmarm,The Hanging of the Schoolmarm,Robert Coover,"November 20, 2016","['The schoolmarm is playing poker in the town saloon. The stake is the saloon itself. As she is preparing to deal the cards, one of the men demands that she cut the fuckin’ deck, and she shoots hi...",The schoolmarm is playing poker in the town saloon The stake is the saloon itself As she is preparing to deal the cards one of the men demands that she cut the fuckin’ deck and she shoots him from...,"[The, schoolmarm, is, playing, poker, in, the, town, saloon, The, stake, is, the, saloon, itself, As, she, is, preparing, to, deal, the, cards, one, of, the, men, demands, that, she, cut, the, fuc...",1200


In [24]:
#drop unnecessary columns: TEXT_WORDS and TEXT_STRIPPED
corpus = corpus.drop(columns=['TEXT_WORDS', 'TEXT_STRIPPED'])

In [25]:
#replace non-English characters in author names
#uppercase titles and authors
#!pip install unidecode

from unidecode import unidecode

def name_cleaning(text):
    text = unidecode(text)
    text = text.upper()
    return text

In [26]:
corpus['TITLE'] = corpus.TITLE.map(lambda x: name_cleaning(x))
corpus['AUTHOR'] = corpus.AUTHOR.map(lambda x: name_cleaning(x))
corpus.head()

Unnamed: 0,URL,TITLE,AUTHOR,DATE,TEXT,WORDCOUNT
0,https://www.newyorker.com/magazine/2022/02/14/annunciation,ANNUNCIATION,LAUREN GROFF,"February 7, 2022","['Some nights, in my dreams, I find myself running through those hills above Palo Alto again. It is always just before dawn, and as I run I smell the sun-crisped fields, the sage, the eucalyptus. ...",9373
1,https://www.newyorker.com/magazine/2022/02/07/once-removed,ONCE REMOVED,ALEXANDER MACLEOD,"January 31, 2022","['She did not want to visit the old lady.', 'Amy studied the stroller, then the bags, then her boyfriend and the baby. She checked her phone: 11:26\xa0a.m. It was time to go. Ninety degrees, ninet...",7778
2,https://www.newyorker.com/magazine/2022/01/31/long-distance,LONG DISTANCE,AYSEGUL SAVAS,"January 24, 2022","['Lea changed the sheets when she got up. She’d bought flowers the previous day, tulips that she’d put on the dresser. There were carnations on the kitchen table, in a squat glass vase. She though...",4866
3,https://www.newyorker.com/magazine/2022/01/24/whats-the-deal-hummingbird,"WHAT'S THE DEAL, HUMMINGBIRD?",ARTHUR KRYSTAL,"January 17, 2022","['On or around May 5th of 2020, he just stopped. He stopped exercising, stopped walking, stopped reading, stopped planning. He ate, drank, washed, and paid the bills, but that was it. He was seven...",3469
4,https://www.newyorker.com/magazine/2022/01/17/fireworks,FIREWORKS,GRAHAM SWIFT,"January 10, 2022","['It was late October, 1962. Russian missiles were being shipped to Cuba. Kennedy was having words with Khrushchev. The world might be coming to an end.', 'It was a common remark: “Cheer up, it’s ...",2687


In [27]:
corpus.to_pickle("corpus_df3_cleaned.pkl")