# Constants

Use these constants to control the size of each topic \
None = whole dataset \
0 = exclude topic \
SEED (controls randomness) is defined in **constants.py**

In [96]:
MAX_SAMPLES_PER_TOPIC = 10000  # sampled dataset size = min(MAX_SAMPLES, len(dataset)

CLIMATE_CHANGE = MAX_SAMPLES_PER_TOPIC
SELFDRIVING_CARS = MAX_SAMPLES_PER_TOPIC
WEATHER = MAX_SAMPLES_PER_TOPIC
AIRLINE_SUPPORT = MAX_SAMPLES_PER_TOPIC
FIFA_WORLD_CUP = MAX_SAMPLES_PER_TOPIC
AUSTRALIAN_ELECTIONS = MAX_SAMPLES_PER_TOPIC
COVID19 = MAX_SAMPLES_PER_TOPIC
CHATGPT = MAX_SAMPLES_PER_TOPIC
STOCK_MARKET_CRASH = MAX_SAMPLES_PER_TOPIC

DUPLICATES_FRACTION = 0.1
SUPERSAMPLE_COEF = 1 + DUPLICATES_FRACTION

In [97]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from constants import SEED
from utils import save_dataset, load_dataset

pd.set_option('display.max_colwidth', None)
rng = np.random.default_rng(seed=SEED)
dataset = None
    
def concat(d1, d2):
    return pd.concat([d1, d2], axis=0, ignore_index=True)

def select_columns(df, columns):
    return df.drop(columns=list(set(df.columns) - set(columns)))

def sample(df, n):
    if n is None:
        return df
    elif n == 0:
        return pd.DataFrame()
    else:
        return df.sample(min(n, len(df)), random_state=SEED)
    
def supersample(df, n):
    n = int(n * SUPERSAMPLE_COEF)
    return sample(df, n) if len(df) > n else df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
import regex as re
import emoji

RE_INVALID_CHARS = re.compile(r"[\p{Cc}\p{Cs}]+")
LOWERCASE_ALPHABET = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'}
UNICODE_ALLOWED_CHARS = "’"

def remove_invalid_chars(s):
    return RE_INVALID_CHARS.sub('', s)

def consists_of_allowed_chars(s):
    return all([
        c.isascii() or c in UNICODE_ALLOWED_CHARS or emoji.is_emoji(c)
        for c in s
    ])

def contains_substrings(s, ss, negation=False, case=True):
    s = s if case else s.lower()
    return any(map(s.__contains__, ss)) ^ negation

def replace_bad_chars(s):
    if len(s) > 0:
        s = re.sub('Ì¢âÂå|Ì¢âÂÒ', ' ', s, flags=re.IGNORECASE)
        s = re.sub('(â€™|Ã¢â‚¬â„¢|Ì¢âÂã¢|Ì¢âÂã¢)', "'", s, flags=re.IGNORECASE)
        s = re.sub('(&amp;amp;|&amp;)', '&',  s, flags=re.IGNORECASE)
        s = re.sub('\$q\$', '"', s, flags=re.IGNORECASE)
        s = re.sub('n"t', "n't", s, flags=re.IGNORECASE)
        s = re.sub(' [ ]+', ' ', s)
    return s.strip()

def string_frequency(df, s):
    return sum([contains_substrings(t, [s], case=False) for t in df['text']]) / len(df)

def randomly_remove_substring(df, ss, p=0.5, case=True):
    ids = df.index[df['text'].apply(lambda s: contains_substrings(s, [ss], case=case))]
    ids = rng.choice(ids, int(p * len(ids)), replace=False)
    df.loc[ids, 'text'] = df.loc[ids, 'text'].str.replace(ss, '', case=case)
    
    f = len(df.index[df['text'].apply(lambda s: contains_substrings(s, [ss], case=case))]) / len(df)
    print(f'"{ss}" new frequency: {f:.2f}')
    
    return df

def remove_retweet_mark(tweet):
    if tweet.lower().startswith('rt @'):
        if ':' in tweet:
            return tweet.split(':')[1]
        else:
            tokens = tweet.split(' ', 2)
            return tokens[2] if len(tokens) >= 3 else ''
    else:
        return tweet
    
def preprocessing_pipeline(df):
    df.dropna(subset='text', inplace=True)
    df['text'] = df['text'].apply(replace_bad_chars).apply(remove_retweet_mark)
    df.drop_duplicates(subset='text', inplace=True)
    df = df[df['text'].apply(consists_of_allowed_chars)]
    return df

In [99]:
import pycld2 as cld2

def compute_word_fraction(s):
    return sum(map(len, [[c for c in t if c.isalpha()] for t in s.split() if not t.startswith(('#', '@', 'http')) and t[0].isalpha()])) / len(s)

def compute_average_token_length(s):
    tokens = s.split(); return sum(map(len, tokens)) / len(tokens) if len(tokens) > 0 else 0

def compute_english_score(s):
    t = cld2.detect(s, hintTopLevelDomain='en', hintLanguage='en')[2][0]; return 0 if t[1] != 'en' else t[3]

def add_stats(df):
    """Inplace function"""
    df['length'] = df['text'].str.len()
    df['word fraction'] = df['text'].apply(compute_word_fraction)
    df['token length'] = df['text'].apply(compute_average_token_length)
    df['english score'] = df['text'].apply(compute_english_score)
    return df

def trim_dataset_by(
        dataset, 
        sort_by=None, 
        ascending=True, 
        length=(None, None), 
        alphabet_fraction=(None, None), 
        word_fraction=(None, None),
        token_length=(None, None),
        english_score=(None, None),
        custom_metrics=None,
        verbose=True,
    ):
    df = dataset.copy()
    n0 = len(df)
              
    if length[0] is not None: df = df[df['length'] >= length[0]]
    if length[1] is not None: df = df[df['length'] <= length[1]]
    
    if word_fraction[0] is not None: df = df[df['word fraction'] >= word_fraction[0]]
    if word_fraction[1] is not None: df = df[df['word fraction'] <= word_fraction[1]]
    
    if token_length[0] is not None: df = df[df['token length'] >= token_length[0]]
    if token_length[1] is not None: df = df[df['token length'] <= token_length[1]]
    
    if english_score[0] is not None: df = df[df['english score'] >= english_score[0]]
    if english_score[1] is not None: df = df[df['english score'] <= english_score[1]]
    
    if custom_metrics is not None:
        for label, a, b in custom_metrics:
            if a is not None: df = df[df[label] >= a]
            if b is not None: df = df[df[label] <= b]
    
    n1 = len(df)
    if verbose: print(f'samples: {n1}, reduction: {((n0 - n1) / n0):.2%}')
    return df.sort_values(sort_by, ascending=ascending)

In [100]:
import multiprocessing

import numpy as np
from rapidfuzz.fuzz import ratio

from utils import start_time, elapsed_time

_strings = None
_n_strings = None
_similarity = None

def flatten(xss):
    return [x for xs in xss for x in xs]

def find_most_similar(i):
    ss = [ratio(_strings[i], _strings[j]) for j in range(i + 1, _n_strings)]
    return [i + 1 + j for j in range(len(ss)) if ss[j] >= _similarity]

def drop_similar(df, max_similarity=0.8):
    global _strings, _n_strings, _similarity
    _strings = df['text'].tolist()
    _n_strings = len(_strings)
    _similarity = max_similarity * 100
    
    start_time()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    out = pool.map(find_most_similar, range(_n_strings - 1))
    similar_ids = set(flatten(out))
    elapsed_time(verbose=True)
    
    ids = set(range(_n_strings)) - similar_ids
    df = df.reset_index(drop=True).iloc[list(ids)]
    diff = _n_strings - len(df)
    print(f'samples: {len(df)}, similar samples detected: {diff} ({diff / _n_strings:.2%})')
    
    return df

# Climate change

Load datset

In [101]:
df = load_dataset('climate_change.csv', source=True, encoding='ISO-8859-1')
df.rename(columns={'message': 'text', 'tweetid': 'id'}, inplace=True)
df = select_columns(df, ['text', 'id'])
df.shape

(43943, 2)

Explore tweets (by repeatedly running the next cell)

In [102]:
df.sample(5)

Unnamed: 0,text,id
15305,esok presentation bi gp \naq tntang global warming.. \nwish me luck.. ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½,840904112195551232
14829,RT @JordanUhl: #EPA Chief Pruitt claims 'tremendous debate' over whether CO2 causes global warming.\n\nNo debate. Just misinformatioâ¦,840255456245383173
41889,"Scientists, investors seek to identify financial risks of climate change #environment",732978742008512512
40720,Viking M. Services: Geothermal heating -- heat pumps and global warming https://t.co/XXzbMnIgfT https://t.co/BztbrVWoKx,697024720139452417
941,The Vita Green Impact Fund is exactly the right investment vehicle for #SDGs &amp; climate change mitigation https://t.co/FgvujJdfjU,793842057655775232


Fix some encoding issues

In [103]:
df = preprocessing_pipeline(df)
df.shape

(25342, 2)

Remove noisy tweets using tweet length, english alphabet share and confidence that the language is English

In [104]:
add_stats(df);

In [105]:
a = trim_dataset_by(
    df, 
    
    # sort_by='length',
    # sort_by='word fraction',
    # sort_by='token length',
    sort_by='english score',
    ascending=True,
    
    length=(60, None),
    word_fraction=(0.5, None),
    token_length=(4, 10),
    english_score=(700, None),
)
a.drop('id', axis=1)[:60]

samples: 18234, reduction: 28.05%


Unnamed: 0,text,length,word fraction,token length,english score
24823,"Late Pleistocene-Holocene vegetation and climate change in the Middle Kalahari, Lake Ngami, Botswana\nhttps://t.co/cAvWNIgbXE @scott_louis",137,0.620438,8.2,700.0
23028,@Philosocrat Oh I believe in climate change. I just don't believe in Delaware.,78,0.653846,5.076923,700.0
30626,Study finds that global warming exacerbates refugee crises | John Abraham https://t.co/KlTmfLCjAd https://t.co/90rdZL1ths,121,0.512397,8.384615,700.0
894,".@OfficialJLD endorses Hillary Clinton, slams Donald Trump over climate change",79,0.696203,6.9,700.0
11257,"Mike Pompeo, Trump's CIA pick, evades questions about climate change and global instability https://t.co/TPzl33TqrV https://t.co/BHYxZFpNBs",139,0.546763,8.333333,700.0
25188,Peru builds up wetland resilience to reinforce indigenous response to climate change https://t.co/x0XAxzKn2j https://t.co/58oYUWGVTz,132,0.55303,8.5,701.0
30668,Study finds that global warming exacerbates refugee crises | John Abraham https://t.co/9FDzShx65z,97,0.639175,7.166667,701.0
1749,"Ralph Cicerone, former UC Irvine chancellor who studied the causes of climate change, dies at 73 - Los Angeles Times https://t.co/fqGVQGVi80",140,0.657143,5.714286,701.0
40678,"Unanticipated consequence of climate change: disease carrying mosquitoes & tics spread north. #Dengue, #Zika, #Lymedisease, #Chikungunya etc",140,0.585714,7.294118,701.0
30152,"Eying vulnerability, Calif. studying how climate change will affect state highways https://t.co/UoI0H4Q4oh https://t.co/Tju0OGHQV4",130,0.538462,9.076923,701.0


Drop similar tweets (it is expensive procedure, that is why we are running it in last order)

In [106]:
df = drop_similar(supersample(a, CLIMATE_CHANGE), max_similarity=0.8)

time elapsed: 8s
samples: 10236, similar samples detected: 764 (6.95%)


Add topic column, sample the pre-defined number of tweets, and create the dataset

In [107]:
df = select_columns(df.copy(), ['text', 'id'])
df = sample(df, CLIMATE_CHANGE)
df['topic'] = 'climate change'
dataset = df.copy()
df.shape

(10000, 3)

# Self-driving cars

In [108]:
df = load_dataset('selfdriving_cars.csv', source=True, encoding='ISO-8859-1')
df = select_columns(df, ['text', 'sentiment'])
df.drop_duplicates(subset='text', inplace=True)
df['text'] = df['text'].apply(remove_invalid_chars)
df.shape

(7146, 2)

Remove unrelated (noisy) tweets

In [109]:
df = df[df['sentiment'] != 'not_relevant']
df = select_columns(df, ['text'])
df.shape

(6933, 1)

Explore dataset

In [110]:
df.sample(5)

Unnamed: 0,text
5122,"We invented the car and now, we're on the verge of introducing the self-driving car. See our breakthroughs: http://t.co/EDurd8e5p0"
3180,GREAT day! I learned in my workshop this AM that the Google car has FINALLY driven by our home! #atleastthelawnwascut http://t.co/xx5UywN3hH
2068,"The future is now. #google driverless car. @ Mill Valley, CA http://t.co/jFRksD1L60"
3184,"@justinvl damn, didnÌ¢âÂã¢t think of that. I could totally wax and polish the Google car!"
6218,"@kurtvarner I was joking dude, can't call it focus with self-driving cars et al. Wanna build product millions of people will love? U at HD?"


In [111]:
df = preprocessing_pipeline(df)
df.shape

(5899, 1)

In [112]:
add_stats(df);

In [113]:
a = trim_dataset_by(
    df, 
    
    # sort_by='length',
    # sort_by='word fraction',
    # sort_by='token length',
    sort_by='english score',
    ascending=True,
    
    length=(40, None),
    word_fraction=(0.5, None),
    token_length=(1, 12),
    english_score=(400, None),
)
a[:60]

samples: 4721, reduction: 19.97%


Unnamed: 0,text,length,word fraction,token length,english score
6010,@SuhelenG quionda vas a orrar tu dinero para los self driving cars coming soon 2 ur area?L.A......,98,0.653061,4.823529,455.0
1641,Burkhard Bilger: Inside Google's Driverless Car http://t.co/jNoW1cBonl via @NewYorker,85,0.505882,8.555556,474.0
461,@JimSpohrer tells a gr8 story on cab drivers askd abt driverless cars.,70,0.657143,4.916667,496.0
3141,Aerial Google self driving car @ Computer History Museum http://t.co/TLrGcELkUV,79,0.594937,7.0,498.0
4427,"In self-driving milestone, Google's autonomous cars are mastering city streets http://t.co/c6WpyiwLXy http://t.co/2IbOmkdCMv",124,0.532258,9.416667,499.0
1135,Sergey Brin unveils new Google driverless car prototype http://t.co/CVON3FADCL,78,0.615385,7.777778,505.0
3764,"Autonomous driving solutions, Lexus smart co pilot. http://t.co/Iqo0NGr2",72,0.597222,8.125,512.0
2396,Google car @ Computer History Museum http://t.co/jApPaWK2rb,59,0.508475,7.571429,512.0
2564,Oh hey Google Car! @ MDT Metromover - Bayfront Park Station http://t.co/4hCAG7Lr,80,0.575,5.75,519.0
959,"Brilliant Smithsonian magazine award winners. Here w/Sebastian Thrun - driverless cars, Udacity. http://t.co/AtPH3FsV",117,0.683761,8.076923,525.0


In [114]:
df = drop_similar(supersample(a, SELFDRIVING_CARS));

time elapsed: 1s
samples: 4497, similar samples detected: 224 (4.74%)


In [115]:
df = select_columns(df.copy(), ['text'])
df['topic'] = 'self-driving cars'
df['id'] = 0

df = sample(df, SELFDRIVING_CARS)
dataset = concat(dataset, df)
df.shape

(4497, 3)

# Weather

In [116]:
df = load_dataset('weather.csv', source=True)
df.rename(columns={'tweet_text': 'text', 'tweet_id': 'id'}, inplace=True)
df = select_columns(df, ['text', 'id', 'sentiment'])
df.shape

(1000, 3)

In [117]:
a = df.copy()
a = a[a['sentiment'] != 'Tweet not related to weather condition']
a = preprocessing_pipeline(a)
a.shape

(762, 3)

In [118]:
df = select_columns(a, ['text', 'id'])

In [119]:
df.sample(5)

Unnamed: 0,id,text
870,84305541,Am I the only person who heard that loud thundering storm last night or was that just me?
257,84050868,"Me and katilyn chatting up the movie, walk outside.. OMG. Sprint to car. We don't like thunder storms. #mature"
464,81992074,Wednesday - Snow showers. Snow accumulation 3 to 5 inches. Highs in the 30s. {link}
105,82839115,"News - Current Conditions : 64.4F, Heavy Thunderstorms and Rain - 7:07 PM EDT May. 16: Temperat... {link} #News #Greensboro"
711,84050258,Its freezing in Boston hows the weather in Providence???


The dataset has already been processed. Author replaced any mentions by @metion and links by {links}. I'd like to replace {link} by %link

In [120]:
df[df['text'].str.contains('{link}')].shape

(152, 2)

In [121]:
df['text'] = df['text'].str.replace('{link}', '%link', regex=False)
df[df['text'].str.contains('{link}')].shape

(0, 2)

In [122]:
add_stats(df);

In [123]:
a = trim_dataset_by(
    df, 
    
    ascending=True,
    sort_by='length',
    # sort_by='word fraction',
    # sort_by='token length',
    # sort_by='english score',
    
    length=(15, None),
    word_fraction=(0.6, None),
    # token_length=(1, 12),
    english_score=(800, None),
)
a[:60]

samples: 595, reduction: 21.92%


Unnamed: 0,id,text,length,word fraction,token length,english score
435,84049777,It's 82 degrees,15,0.666667,4.333333,2441.0
519,81993022,Its Freezing D;,15,0.8,4.333333,1706.0
843,83269790,What up sunshine,16,0.875,4.666667,1927.0
125,80052225,Morning sunshine.,17,0.882353,8.0,1505.0
401,84031161,Is it hot outside?,18,0.777778,3.75,2730.0
934,82509653,Really hot outside,18,0.888889,5.333333,2209.0
54,84034743,It's fuckin hot out,19,0.789474,4.0,1792.0
613,84050364,Its hot af outside.,20,0.75,4.0,1940.0
266,84308267,My room is freezing,20,0.8,4.0,1792.0
509,84318313,Its a million degrees,21,0.857143,4.5,2001.0


In [124]:
df = drop_similar(a);

time elapsed: 0s
samples: 587, similar samples detected: 8 (1.34%)


In [125]:
df = select_columns(df.copy(), ['text', 'id'])
df['topic'] = 'weather'
df['id'] = 0

df = sample(df, WEATHER)
dataset = concat(dataset, df)
df.shape

(587, 3)

# Airline support

In [126]:
df = load_dataset('airline_support.csv', source=True, encoding='ISO-8859-1')
df = select_columns(df, ['text'])
df.shape

(14640, 1)

In [127]:
df = preprocessing_pipeline(df)
df.shape

(13599, 1)

In [128]:
df.sample(5)

Unnamed: 0,text
10812,"@USAirways and next flight delayed as well. 5238 out of Charlotte, is it just my luck or what?"
10755,@USAirways thank you
2442,@united let me assure you my travel time is 4h2m not 5h2m #timezones #accuratetraveltimes #3rdtimethishashappened http://t.co/e0C9bI09cf
6615,@SouthwestAir Hey SWAir- a very tortured boarding on flight 1971 due to one gate agent having to board assisted folks one at a time.
6411,@SouthwestAir I'm Flight Booking Problems a flight to Vegas. Any good promo codes??


In [129]:
add_stats(df);

In [130]:
a = trim_dataset_by(
    df, 
    
    # sort_by='length',
    sort_by='word fraction',
    # sort_by='token length',
    # sort_by='english score',
    ascending=True,
    
    length=(70, None),
    word_fraction=(0.6, None),
    token_length=(None, 7),
    english_score=(650, None),
)
a[:60]

samples: 8509, reduction: 37.43%


Unnamed: 0,text,length,word fraction,token length,english score
13043,"@AmericanAir hey ho its not me losing any money (only you) just next make sure you stick to the ""flyers right's booklet""",120,0.6,4.5,1324.0
14264,"@AmericanAir Still sitting in airport as 6:05 flt delayed until 6:55...but still not boarding, what is happening? No info....",125,0.6,5.631579,1033.0
5875,"@SouthwestAir my trip is a month away...why do you consider it ""around the corner""? Is that a setting I can change?",115,0.6,4.52381,1287.0
12875,"@AmericanAir Yes, thanks I found those, didn't see the gray tab at first :)",75,0.6,4.428571,1199.0
7986,"@JetBlue No, the flight wasn't until 9:51pm, but it's already been delayed.",75,0.6,5.333333,1536.0
5192,@SouthwestAir reservation (FEHQNE) 21FEB15 | DCA-RSW. Want refund not credit for Cancelled Flightled flight please.,115,0.6,6.733333,984.0
358,"@VirginAmerica just promoting the product is all, had a problem with southwest and recommend noneother than the best! http://t.co/tFaNXBh1Cf",140,0.6,6.421053,1106.0
88,@VirginAmerica I &lt;3 Flying VA But Life happens and I am trying to #change my trip JPERHI Can you help.VA home page will not let me ?,135,0.6,4.037037,975.0
8025,@JetBlue has the worst customer service out of any airline company. #dontflythem,80,0.6,5.75,958.0
6729,@SouthwestAir the last 4 times I've arrived @LASairport our gate has been blocked by a slow to depart plane leading to 30-60 min delays,135,0.6,4.666667,926.0


In [131]:
df = drop_similar(supersample(a, AIRLINE_SUPPORT));

time elapsed: 6s
samples: 8468, similar samples detected: 41 (0.48%)


In [132]:
df = select_columns(df.copy(), ['text'])
df['topic'] = 'airline support'
df['id'] = 0

df = sample(df, AIRLINE_SUPPORT)
dataset = concat(dataset, df)
df.shape

(8468, 3)

# FIFA World Cup

In [133]:
df = load_dataset('fifa_world_cup.csv', source=True)
df.rename(columns={'Tweet': 'text'}, inplace=True)
df = select_columns(df, ['text'])
df.shape

(22524, 1)

In [134]:
df = preprocessing_pipeline(df)
df.shape

(17341, 1)

In [135]:
df.sample(5)

Unnamed: 0,text
6989,Anyone got odds on Qatar to win 1-0 with a comedy last minute own goal and Ecuador to have 8 goals disallowed...? Asking for a Qatari Sheikh.... #WorldCup2022 #FifaWorldCup
7981,#WorldCup2022 Be There!! https://t.co/wPL9L4gjDW
11956,And the headlines are already out in India!! JK you were amazing!! @BTS_twt #FIFAWorldCup #WorldCup2022 #jungkook #FIFAWorldCup2022 https://t.co/dCjuvv4OQg
4446,After watching #FIFAUncovered I have trust issues. Valencia is giving me hope though. 😅\n #QatarEcuador\n#WorldCup2022
7914,I'm saying it now match fixing for Qatar #WorldCup2022


In [136]:
add_stats(df);

In [137]:
a = trim_dataset_by(
    df, 
    ascending=True,
    
    # sort_by='length',
    length=(55, None),
    
    # sort_by='word fraction',
    word_fraction=(0.5, None),
    
    # sort_by='token length',
    token_length=(4, 9),
    
    sort_by='english score',
    english_score=(750, None),
)
a[:60]

samples: 7041, reduction: 59.40%


Unnamed: 0,text,length,word fraction,token length,english score
4495,Valencia 2 goals already. Ecuador lead 0-2 #WorldCup2022,56,0.553571,6.125,750.0
5849,"Enner Valencia took 9 minutes to resettle, then told VAR to choke on its match-fixiness. #WorldCup2022 https://t.co/m1FYpEvlzL",126,0.555556,6.470588,750.0
3356,Qatar seff go collect woto woto for this competition 😂😂\n#WorldCup2022 #Qatar2022,80,0.55,5.75,752.0
19771,Predict who will be the quarter finalists and win free FIFA tokens.\n\n#FIFAWorldCup #WorldCup #WorldCup2022,106,0.518868,6.066667,752.0
9663,Course that Ecuador goal was offside 😭🤣🤑 #QATECU #WorldCup2022,62,0.5,6.0,753.0
1648,Fans chant for Palestine and raise Palestinian flags during a live broadcast of an Israeli channel from Qatar. #WorldCup2022. #RaisePalestineFlag https://t.co/8Z955J85Ty,169,0.544379,7.095238,754.0
12827,Clint Dempsey calling out Alexi Lawless on live TV. He been a hater 🤣🤣 #WorldCup2022,84,0.642857,4.666667,754.0
10122,Ecuador scores the first world cup goal in less than 3 minutes. 🤯 #WorldCup2022 #WorldcupQatar2022,98,0.510204,5.6,755.0
4213,Mina I even forgot kune #WorldCup2022 yaz.. 😴 Anyway @netflix ingiphethe sharp.. 😉,82,0.52439,5.384615,756.0
868,"COLOMBIAN SINGER MALUMA:\n\nThe star features alongside Nicki Minaj and Myriam Fares on Tukoh Taka, the official Fifa Qatar World Cup anthem. ^MA\n#FIFAWorldCup\n#WorldCup2022 #LOOKUPTV https://t.co/J9yQg2aSjH",205,0.556098,6.592593,757.0


In [138]:
df = a

Randomly remove hashtag #WorldCup2022 to complicate topic discovery

In [139]:
df = randomly_remove_substring(df, '#WorldCup2022', p=0.7)

"#WorldCup2022" new frequency: 0.29


In [140]:
df = drop_similar(supersample(a, FIFA_WORLD_CUP));

time elapsed: 6s
samples: 6854, similar samples detected: 187 (2.66%)


In [141]:
df = select_columns(df.copy(), ['text'])
df['topic'] = 'fifa world cup'
df['id'] = 0

df = sample(df, FIFA_WORLD_CUP)
dataset = concat(dataset, df)
df.shape

(6854, 3)

# Australian elections

In [142]:
df = load_dataset('australian_elections.csv', source=True)
df.rename(columns={'full_text': 'text', 'favorite_count': 'likes'}, inplace=True)
df = select_columns(df, ['text', 'id'])
df.shape

(183379, 2)

In [143]:
a = df.copy()

a['text'] = a['text'].apply(replace_bad_chars)
a.drop_duplicates(subset='text', inplace=True)
a = a[a['text'].apply(consists_of_allowed_chars)]

a.shape

(165029, 2)

In [144]:
df = a

In [145]:
add_stats(df);

In [146]:
a = trim_dataset_by(
    df, 
    ascending=True,
    
    # sort_by='length',
    length=(70, None),
    
    # sort_by='word fraction',
    word_fraction=(0.7, None),
    
    # sort_by='token length',
    token_length=(3.5, 8),
    
    sort_by='english score',
    english_score=(1000, None),
)
a[:60]

samples: 21289, reduction: 87.10%


Unnamed: 0,id,text,length,word fraction,token length,english score
118115,1128160964199608320,"Liberal candidates thinks only heterosexual couples should be able to adopt, that carbon emissions aren't pollution and that marriage stops domestic violence.\n\nThese deranged candidate are here to do one job. Kill the working class.\n#auspol #ausvotes\nhttps://t.co/i49LmqVcqp",274,0.70073,6.210526,1000.0
45521,1129923506010038272,"Morrison is on record as saying labour laws need reform so teenagers can get workplace experience. He's also pro-coal. We'll see kids working in the mines, just like Dickensian England, within 12 months. Mark my words. #auspol",226,0.761062,5.135135,1000.0
163798,1129679835993788417,Our student panelists are discussing are worrying trend of candidates making Islamophobic and anti-immigration comments #ausvotes,129,0.806202,7.125,1000.0
90300,1129148682849251330,"It's not an efficiency dividend, it's a maladministration and corruption dividend #auspol",89,0.764045,6.5,1000.0
50832,1129802158369517568,Let's try to find a few positives in this terribly disappointing result.\nAbbott - gone.\nAnning - no seat.\nGovt - precarious majority.\nA few of those QLD dudes don't look very physically healthy. \nMorrison will have to own the tanking economy.\nMight not last 3 years.\n#auspol,274,0.751825,4.708333,1000.0
94457,1128966209729253376,RIP Bob Hawke. Using his last days to promote his vision for a better Australia. And heal old rifts. And generally be a good man. #auspol,137,0.737226,4.307692,1000.0
153896,1129741496985509892,i can’t believe the twitter emoji for the australian federal election is a fucking sausage on bread this is so humiliating #ausvotes,132,0.765152,5.045455,1000.0
31976,1130382740468244480,"Ten years ago, religion barely featured in daily public life/ discussion. In a country where over 60% of us claim a religious affiliation, there was a general acceptance that religion was a person’s private matter.\n\nHow times have changed. #auspol https://t.co/FactA8Tsaj",271,0.701107,5.609756,1000.0
166204,1129646190813896706,"Was going for a run in Kooyong before, Greens corflutes plastered over trees in a bush land (Kew Boulevard) with climate change & looking after the environment. Some irony #AUSVotes",181,0.745856,5.066667,1000.0
2680,1130148524606124032,"@clairegcoleman Why do you blame Australia when clearly nearly 50% didn't vote for those things. Morrison(LNP), Palmer & Poorlean were instrumental in making it happen preferentially.",183,0.726776,6.076923,1000.0


In [147]:
df = drop_similar(supersample(a, AUSTRALIAN_ELECTIONS));

time elapsed: 18s
samples: 10907, similar samples detected: 93 (0.85%)


In [148]:
df = a
s = '#coronavirus'
print('string frequencies:')

s = '#auspol'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#ausvotes'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#australiavotes'; print(f'{s}: {string_frequency(df, s):.2f}')

string frequencies:
#auspol: 0.56
#ausvotes: 0.30
#australiavotes: 0.01


In [149]:
df = randomly_remove_substring(df, '#auspol', p=0.8, case=False)
df = randomly_remove_substring(df, '#ausvotes', p=0.5, case=False)

"#auspol" new frequency: 0.11
"#ausvotes" new frequency: 0.15


In [150]:
df.sample(5)

Unnamed: 0,id,text,length,word fraction,token length,english score
23869,1128727062066409472,"@tedcruz Maybe we worry about our election booths first? Speaking of elections, have you heard that @Australia has mandatory voting? What a concept.",148,0.702703,5.478261,1024.0
9118,1129838671824343042,Is everybody back home in Australia awake and watching #Eurovision because of the soul crushing insomnia after the election?,124,0.758065,5.578947,1165.0
49360,1129869619391475712,You just voted the Liberal party back in. It's like winning a lottery in reverse.,89,0.719101,4.625,1188.0
7014,1129891832580624384,"This election was won by big bold lies, media amplifying them, pressuring Labor based on false premises & $60M from Palmer. Well and good to gloat but the idea that this was a win for the people of Australia is entirely false. Wait for the paybacks to Murdoch & Palmer #insiders",278,0.751799,4.470588,1242.0
35730,1130274918589845504,I must be misunderstanding Joel Fitzgibbon. He thinks the ALP has to move to the right to find the centre?\n,114,0.745614,4.47619,1033.0


In [151]:
df = randomly_remove_substring(df, '#auspol', p=0.6)

"#auspol" new frequency: 0.04


In [152]:
df = randomly_remove_substring(df, '#ausvotes', p=0.2)

"#ausvotes" new frequency: 0.10


In [153]:
df = select_columns(df.copy(), ['text', 'id'])
df['topic'] = 'australian elections'

df = sample(df, AUSTRALIAN_ELECTIONS)
dataset = concat(dataset, df)
df.shape

(10000, 3)

# COVID19

In [154]:
df1 = load_dataset('covid_train.csv', source=True, encoding = 'ISO-8859-1')
df2 = load_dataset('covid_test.csv', source=True)
df = pd.concat([df1, df2])
df.rename(columns={'OriginalTweet': 'text', 'Sentiment': 'sentiment'}, inplace=True)
df = select_columns(df, ['text', 'sentiment'])
df.shape

(44955, 2)

In [155]:
df = preprocessing_pipeline(df)
df.shape

(36980, 2)

In [156]:
add_stats(df);

In [157]:
a = trim_dataset_by(
    df, 
    ascending=True,
    
    # sort_by='length',
    length=(60, None),
    
    sort_by='word fraction',
    word_fraction=(0.60, None),
    
    # sort_by='token length',
    token_length=(4, 8),
    
    # sort_by='english score',
    english_score=(900, None),
)
a[:60]

samples: 19558, reduction: 47.11%


Unnamed: 0,text,sentiment,length,word fraction,token length,english score
33054,"https://t.co/2yMUPatIYU Wear a mask! Especially in situations where you may be around people, like at the grocery store or pharmacy. Make your own mask, decorate it and have fun with your designs, make something cute and friendly. #CoronaVirus #FlattenTheCurve #StopTheSpread",Extremely Positive,275,0.6,5.9,1361.0
8396,Here are the facts about Coronavirus according to Consumer Reports: \r\r\n https://t.co/47fybkzW5t,Neutral,95,0.6,7.363636,1024.0
13225,Is not a panic buying because of the global #coronavirus pandemic. Is about #Walmart online shopping and their politics of not products available. https://t.co/ZIJHJqGQ9F,Positive,170,0.6,6.125,1154.0
24459,"#2020Is traveling with alcohol swabs in your pocket so that you can wipe down every keypad you have to interact with at the ATM, supermarket, and pharmacy, which are the only places you can still go. #StayAtHome #StayHome #Covid_19 https://t.co/J2Axi63CxJ",Neutral,255,0.6,5.4,1342.0
13267,"COVID-19, has just over 1,372 cases in the UK, and 35 deaths\r\r\n\r\r\nCANCER has around 1000 new cases every day and 450 deaths every day, (that's a 45% death rate) so why is that not causing panic buying and empty Supermarket shelves? https://t.co/gxq3Ai8Pma",Extremely Negative,255,0.6,4.837209,1418.0
34018,"We are excited to announce Pay-What-You-Can prices for our virtual event Self-Esteem: Coping in Collective Isolation, on April 16th. We recognize these are hard times and still want everyone to feel welcome. Info @ https://t.co/XKVI1jYAMT\r\r\n\r\r\n#coronavirus #mentalhealth #selfcare",Positive,280,0.6,6.263158,1296.0
3610,Sainsbury: Give the elderly the first hour of supermarket trading during Covid-19 - Sign the Petition! https://t.co/sqPmSCMbGB via @UKChange,Neutral,140,0.6,6.421053,1070.0
7828,For an ongoing display of sheer idiocy check your local supermarket #coronavirus #coronavirusuk,Neutral,95,0.6,6.384615,1013.0
34633,"As the #Covid_19 crisis pushes the food business into unkown terrain, the fragility & flaws in our globalised food systems are revealed..the longer the supply networks, the more vulnerable they become. We need 'place based' food economies. #FoodSystems https://t.co/SlbARl2L19 https://t.co/mzZyRxwql5",Extremely Negative,300,0.6,6.525,1148.0
24838,Look at these cheap oil and gas prices when I have no where to go..\r\r\n\r\r\n#coronavirus,Negative,85,0.6,4.0625,1365.0


In [158]:
df = drop_similar(supersample(a, COVID19));

time elapsed: 18s
samples: 10809, similar samples detected: 191 (1.74%)


In [159]:
df = a
s = '#coronavirus'
print('string frequencies:')

s = 'coronavirus'; print(f'{s}: {string_frequency(df, s):.2f}')
s = 'covid'; print(f'{s}: {string_frequency(df, s):.2f}')
s = 'covid19'; print(f'{s}: {string_frequency(df, s):.2f}')

s = '#coronavirus'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#corona'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#covid'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#covid19'; print(f'{s}: {string_frequency(df, s):.2f}')

string frequencies:
coronavirus: 0.26
covid: 0.47
covid19: 0.05
#coronavirus: 0.22
#corona: 0.24
#covid: 0.16
#covid19: 0.05


In [160]:
df = randomly_remove_substring(df, '#coronavirus', p=0.8, case=False)
df = randomly_remove_substring(df, '#covid19', p=0.8, case=False)

"#coronavirus" new frequency: 0.04
"#covid19" new frequency: 0.01


In [161]:
df.sample(5)

Unnamed: 0,text,sentiment,length,word fraction,token length,english score
33744,Thank you to our hidden heroes They continue to keep shelves stocked and packages delivered amid 19 From the grocery store clerks to truck drivers pharmacy staff and delivery drivers we thank you for helping our communities,Extremely Positive,223,0.829596,5.054054,1264.0
261,Social distancing to prevent the spread of coronavirus could have devastating effect on people with depression. https://t.co/rlJ1MaaxgI,Extremely Negative,135,0.703704,7.0,977.0
26559,Global surveys of consumer sentiment during the coronavirus crisis,Extremely Negative,66,0.878788,6.444444,1054.0
23751,"I have family on SSI and Social Security. They have had to stock up on food, pay for people to bring them food, supplies, go get their meds. It is not fair to say they are not American because they are living in poverty or cannot go places because of #coronavirus #socialsecurity",Negative,279,0.702509,4.384615,1384.0
9566,Great to see some positives coming from #coronavirus people are now wanting to try growing their own food #foodsecurity #growyourown Bunnings sold out of seedling stock and are trying to keep up. Hopefully people continue this way of life,Extremely Positive,238,0.680672,5.128205,1442.0


In [162]:
df = select_columns(df.copy(), ['text'])
df['topic'] = 'covid19'
df['id'] = 0

df = sample(df, COVID19)
dataset = concat(dataset, df)
df.shape

(10000, 3)

# ChatGPT

In [163]:
df = load_dataset('chatgpt.csv', source=True)
df.rename(columns={'content': 'text', 'like_count': 'likes', 'retweet_count': 'retweets'}, inplace=True)
df = select_columns(df, ['text', 'id', 'likes', 'retweets'])  # 
df.shape

(500036, 4)

In [164]:
a = df.iloc[:200000].copy()
a = preprocessing_pipeline(a)
a.shape

(172439, 4)

In [165]:
df = a;
add_stats(df);

In [166]:
a = trim_dataset_by(
    df, 
    ascending=True,
    
    # sort_by='length',
    length=(80, None),
    
    # sort_by='word fraction',
    word_fraction=(0.70, None),
    
    # sort_by='token length',
    token_length=(4, 8),
    
    # sort_by='english score',
    english_score=(900, None),
    
    # sort_by='likes',
    sort_by='retweets',
    custom_metrics=[
        ['likes', None, None],
        ['retweets', None, None]
    ]
)
a[:60]

samples: 24866, reduction: 85.58%


Unnamed: 0,id,text,likes,retweets,length,word fraction,token length,english score
4,1641213003260633088,"Most people haven't heard of Chat GPT yet.\nFirst, elite factions will decide which way to go on AI safety. Next they will push their agenda(s) on the public with misleading and oversimplified media presentations. Finally, the brainless Red and Blue camps will screech their lines",0.0,0.0,279,0.810036,5.086957,1296.0
117992,1635801899348656129,if i use chat gpt and dall-e to create a completely original person's online presence and then post it all over the internet. what are the odds i find an almost identical person?,1.0,0.0,178,0.797753,4.424242,1220.0
117985,1635802267491155969,"@Cyb3rMonk I’ll be honest, I’m struggling to make it useful for me as well but maybe I’m overthinking it.\n\nI however had a lesser experienced colleague solve an on going technical issue by using Chat GPT. The resolution provided was actually very clearly outlined and spot-on.",1.0,0.0,276,0.768116,5.0,1311.0
117982,1635802352069525505,"#ChatGPT releasing ChatGPT 4 and allowing image inputs is a game changer. \n\nCan’t wait to be able to throw in financial statements, diagrams, etc. and have it summarize or explain.",0.0,0.0,180,0.744444,4.966667,1227.0
117968,1635802927183921154,"Today GPT-4 drastically cut the half-life of research based on LLMs.\n\nMy WIP on ChatGPT prompt engineering for long essays is rendered moot just 3mos after model early release.\n\nBeyond SOTA models themselves, what research of lasting value can be done?\n\nhttps://t.co/NZdWa6bbgV",0.0,0.0,277,0.718412,5.547619,1189.0
117949,1635803283179843584,"I spent some time to quickly scan the #gpt4 paper, I wonder how to implement it, is it possible to rebuild a #ChatGPT with a high performance computer?",0.0,0.0,151,0.715232,4.428571,1129.0
117947,1635803312212815873,"ChatGPT (GPT-4) still has significant limitations for research (especially legal). It tends to make major factual errors in citations and case summaries.\n\nCurrently, it's only useful for document drafting, but document drafting mostly uses templated letters anyway.",7.0,0.0,265,0.766038,6.162162,1374.0
117935,1635803635186556930,"I believe prompt engineering is something in the future.\n\nI can't believe that there is still people learning in an old fashion way rather than using ChatGPT while learning how to make better prompt.\n\nTon of SAAS is using GPT tech, and it is just the beginning of prompt engineer.",0.0,0.0,280,0.8,4.58,1463.0
117930,1635803831278641159,"A new version of the AI system that powers the popular chatbot has better language skills, but is still biased, prone to fabrication, and can be abused.GPT-4 Will Make ChatGPT Smarter but Won't Fix Its Flaws https://t.co/fsd8kGWyBi",0.0,0.0,231,0.714286,5.27027,1206.0
117994,1635801833334784000,"@rSanti97 Every interpretive ask I've given chat gpt has been mediocre at best, and these are broad ""discuss the themes of ____"" I presume the more specific questions about exact phrasing would trip it up very quickly.",0.0,0.0,218,0.720183,4.918919,1117.0


In [167]:
def subset_contains(df, ss, **kwargs):
    return df[df['text'].apply(lambda s: contains_substrings(s, ss, **kwargs))]

subset_contains(a, ['#chatgpt'], case=False).shape

(9268, 8)

In [168]:
df = randomly_remove_substring(a, '#chatgpt', p=0.8, case=False)

"#chatgpt" new frequency: 0.07


In [169]:
df = drop_similar(supersample(df, CHATGPT));

time elapsed: 18s
samples: 10877, similar samples detected: 123 (1.12%)


In [170]:
df.sample(5)

Unnamed: 0,id,text,likes,retweets,length,word fraction,token length,english score
8880,1636973523284512768,"@amli_art Wait next week. It's not guaranteed, but we might see a large language model (the same AI system that powers ChatGPT or GPT-4) that is truly open and designed to be trainable with your content. If that happens, in weeks you'll have the system you need from the AI community 4 free",1.0,0.0,290,0.741379,4.490566,1287.0
3614,1638083128890646529,I have 50 developers in my team who work on a large set of distributed applications and machine learning services that contain complex business logic to control and automate our logistics processes. #chatGPT could not come up with a single useful LOC here. So why bother with it?,0.0,0.0,279,0.784946,4.833333,1357.0
7792,1632700607311265794,"@unusual_whales Chat GPT is great with words but not good at all with crunching accurate numbers, so I think the financial analysts / bean counters are safe",0.0,0.0,156,0.724359,4.814815,1438.0
4727,1637084131984121859,@elonmusk I asked chat gpt Ai some questions and the answers were definitely bias politically and answered very pro pharma. Chat Gpt is bull sheeeet.,0.0,0.0,149,0.765101,5.0,1058.0
5174,1637288294227804160,"@wretchardthecat Isn't the idea that AI is objectively truthful already dead and buried though? Because we have many, many examples of Chat GPT simply making up facts in the middle of detailed responses. In one case I read about it went so far as to fabricate the existence of medical studies!",0.0,0.0,293,0.757679,4.764706,1272.0


In [171]:
df = select_columns(df.copy(), ['text', 'id'])
df['topic'] = 'chatgpt'

df = sample(df, CHATGPT)
dataset = concat(dataset, df)
df.shape

(10000, 3)

# Stock Market Crash

In [172]:
df = load_dataset('stock_market_crash.csv', source=True)
df = select_columns(df, ['text', 'id'])
df.shape

(33946, 2)

In [173]:
df = preprocessing_pipeline(df)
df.shape

(28813, 2)

In [174]:
add_stats(df);

In [175]:
a = trim_dataset_by(
    df, 
    # ascending=False,
    ascending=True,
    
    sort_by='length',
    length=(70, None),
    
    # sort_by='word fraction',
    word_fraction=(0.55, None),
    
    # sort_by='token length',
    token_length=(4, 8),
    
    # sort_by='english score',
    english_score=(850, None),
)
a[:60]

samples: 10274, reduction: 64.34%


Unnamed: 0,id,text,length,word fraction,token length,english score
28563,1524336760536322050,"$GMT rekt, another blabla2earn ponzi scam is in mud in the #bearmarket",70,0.6,4.916667,1024.0
27113,1526024535140536320,"Short to medium term pessimism, long term optimism #crypto #bearmarket",70,0.6,6.1,1024.0
19924,1485625253942468621,#NFT are the only thing holding its ground right now #stockmarketcrash,70,0.557143,5.454545,1424.0
14829,1496786838723473411,Bought the dip today\n\nNiftyBees\nBankBees\nJuniorBees\n\n#stockmarketcrash,70,0.628571,7.625,993.0
24874,1533270299407486976,#NFT Collections that do well during a #bearmarket are ones to watch 👀,70,0.6,4.461538,1513.0
31460,1495114491314458633,Follow StockAlgos for more investing tips & growth tools.\n\n#BEARMARKET,70,0.671429,6.0,1101.0
14834,1496786525236838400,If you have sold today\nMy Dear that was a wrong call\n#stockmarketcrash,70,0.585714,4.461538,1316.0
10643,1522663485380993032,Wednesday was the biggest bull trap I’ve ever seen 😭 #stockmarketcrash,70,0.585714,5.454545,1219.0
23815,1536235134386638848,To note for all those new to trading you would call this a #bearmarket,70,0.657143,4.071429,1433.0
1306,1537441785722548232,Patience is the name of the game in the stock market #stockmarketcrash,70,0.6,4.916667,1301.0


In [176]:
df = a
print('string frequencies:')

s = '#stockmarketcrash'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#stockmarket'; print(f'{s}: {string_frequency(df, s):.2f}')
s = '#bearmarket'; print(f'{s}: {string_frequency(df, s):.2f}')

string frequencies:
#stockmarketcrash: 0.61
#stockmarket: 0.62
#bearmarket: 0.40


In [177]:
df = randomly_remove_substring(df, '#stockmarketcrash', p=0.5, case=False)
df = randomly_remove_substring(df, '#bearmarket', p=0.5, case=False)

"#stockmarketcrash" new frequency: 0.30
"#bearmarket" new frequency: 0.20


In [178]:
df.sample(5)

Unnamed: 0,id,text,length,word fraction,token length,english score
11457,1522562060130861058,"We can turn this around by making the government balance the budget and pay off its debts.\n\nFederal revenue is 4.2 trillion annually. We are paying 360 billion just to pay the interest on our 30+ trillion dollar debt.\n\nThey can do it, we have to make them.\n \n🙂",278,0.694245,4.5,1562.0
9545,1524060205968941056,"(3)It’s a great buying opportunity for all of us. Amidst the noise of stock market bubble and correction, there are a few great companies getting crushed.This is thus a great opportunity to for us to do our research and find such companies to invest! #stockmarketcrash #recession",279,0.702509,5.086957,1249.0
3402,1536382109232807938,You ain’t seen nuthin’ yet! Just wait a little longer and the US dollar is no longer the reserve currency. You think 5 bux a gallon for gas is bad?!? Hold my beer. You can thank Comrade Chairman Biden for the destruction of your lives. #stockmarketcrash,253,0.715415,4.521739,1394.0
20263,1538463191465861122,Hi #CryptoCommunity! 🫠\n\nWish you get through this #BearMarket2022. \nHodl on 🚀\n\nWhat #Altcoins analysis do you want to see today? Write your favorite altcoin in the comments 👇,174,0.557471,4.931034,1247.0
11380,1522572971016998913,is not a real crash in the slightest. A decline of 3-4% is really a drop in the bucket. The point amount doesn't matter.,138,0.644928,4.56,1433.0


In [179]:
df = drop_similar(supersample(df, STOCK_MARKET_CRASH));

time elapsed: 14s
samples: 10156, similar samples detected: 118 (1.15%)


In [180]:
df = select_columns(df.copy(), ['text', 'id'])
df['topic'] = 'stock market crash'

df = sample(df, STOCK_MARKET_CRASH)
dataset = concat(dataset, df)
df.shape

(10000, 3)

# Dataset

In [185]:
dataset.shape

(70406, 3)

Topic distribution

In [182]:
print(dataset.shape)
dataset.groupby('topic').nunique()

(70406, 3)


Unnamed: 0_level_0,text,id
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
airline support,8468,1
australian elections,10000,10000
chatgpt,10000,10000
climate change,10000,10000
covid19,10000,1
fifa world cup,6854,1
self-driving cars,4497,1
stock market crash,10000,10000
weather,587,1


Explore random tweets

In [183]:
dataset[['text', 'topic']].sample(5)

Unnamed: 0,text,topic
37187,You can’t win a federal election in Australia by telling the people they are going to lose money in their hip pocket,australian elections
27807,Looking forward to the bent refereeing today.,fifa world cup
59011,"@scottmelker I tried to get chat gpt to help me start a new religion for that exact reason, converting people from old shit religions to a new better one. It told me that it is disrespectful and immoral.\nI said so like a mega pastor?\nChat gpt, yes, just like a mega pastor.",chatgpt
17857,@AmericanAir How do I check? Reservation for Joe Watson and Kelsey Jennings. We were on hold for 2 hours. Waiting for call back now. ETA?,airline support
19838,"@SouthwestAir you have the worst service, you Cancelled Flightled all your flights FLL to PHL all @USAirways flights flew. Stuck in FL 3 days. #done",airline support


In [186]:
# save_dataset(dataset, 'dataset70000.csv')