# Import Libraries

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import snscrape.modules.twitter as sntwitter
from collections import Counter

# Scraping Data

In [2]:
scraper = sntwitter.TwitterSearchScraper("Arbitrum")

tweets = []
n_tweets = 5000

for i , tweet in tqdm(enumerate(scraper.get_items()),total=n_tweets):
    data = [
        tweet.date,
        tweet.rawContent,
        tweet.cashtags,
    ]
    tweets.append(data)
    if i > n_tweets:
        break

  0%|          | 0/5000 [00:00<?, ?it/s]

In [3]:
tweet_df =pd.DataFrame(
    tweets, columns=["date","content","cashtags"]
)
tweet_df.to_csv("GRAIL Comment Scraper.csv", index=False)

In [4]:
tweet_df["cashtags"]

0       [FB, ARB, SFUND, USDT]
1                         None
2                         None
3                         None
4                   [ARB, URD]
                 ...          
4997                      None
4998                      None
4999                     [AFA]
5000                      None
5001                      None
Name: cashtags, Length: 5002, dtype: object

In [5]:
#fill null with "unknown"
#tweet_df.fillna("Unknown",inplace=True)

In [6]:
tweet_df

Unnamed: 0,date,content,cashtags
0,2023-03-27 20:58:44+00:00,🥳 Sayel Ahmed Giveaway ！💰\n\n🏆 Prize Pool : $0...,"[FB, ARB, SFUND, USDT]"
1,2023-03-27 20:58:44+00:00,@arbitrum @WombatExchange https://t.co/q718lzJGkD,
2,2023-03-27 20:58:41+00:00,@layer3xyz @arbitrum 1800,
3,2023-03-27 20:58:38+00:00,"@MINDGames_io For more information, the projec...",
4,2023-03-27 20:58:25+00:00,Exciting news for DeFi enthusiasts! Check out ...,"[ARB, URD]"
...,...,...,...
4997,2023-03-27 13:52:44+00:00,@Cryptostepper09 @AlphaSeeker21 @arbitrum Ahah...,
4998,2023-03-27 13:52:41+00:00,@aprlia_alya @Arbitrum_Space @Auradx_ai @Telko...,
4999,2023-03-27 13:52:33+00:00,@FlokiAi_Arb $AFA #ArbFlokiAI #Arbitrum #floki...,[AFA]
5000,2023-03-27 13:52:31+00:00,@BitStubs The Arbitrum airdrop is just 3 hours...,


In [7]:
# Liste der zu ignorierenden Cashtags
ignore_list = ['$BTC', '$ARB', '$ETH', "$Arb","$USDC","$arb","$BNB","$USDT","$OP","$MATIC", "$BONE", "$BLUR", "$SHIB",
               "$APE", "$EGGS", "$APT", "$FIL", "$HEX", "$DOGE", "$EVMOS", "$SHI","$10,000", "$2000+", "$10", "$5OO,OOO!",
               "$3000!", "$500", "$1500"]

# Extract cashtags from the tweet content
def extract_cashtags(text):
    words = text.split()
    return [word for word in words if word.startswith("$")]

# Apply the extract_cashtags function to each tweet and store the results in a new column
tweet_df['cashtags'] = tweet_df['content'].apply(extract_cashtags)

# Create a new DataFrame with one row for each cashtag mentioned in each tweet
rows = []
for index, row in tweet_df.iterrows():
    for cashtag in row['cashtags']:
        rows.append({'Cashtag': cashtag})

# Convert the rows to a DataFrame
cashtag_df = pd.DataFrame(rows)

# Filter out the ignored cashtags
cashtag_df = cashtag_df[~cashtag_df['Cashtag'].isin(ignore_list)]

# Count the frequency of each cashtag
cashtag_counts = Counter(cashtag_df['Cashtag'])

# Get the top 100 most common cashtags
top_cashtags = cashtag_counts.most_common(50)

# Convert the list of tuples to a DataFrame
top_cashtags_df = pd.DataFrame(top_cashtags, columns=['Cashtag', 'Frequency'])

# Export the results to Excel
top_cashtags_df.to_excel('arbi_presales_27.03.2023.xlsx', index=False)

In [8]:
print(top_cashtags_df)

    Cashtag  Frequency
0      $ZSP        167
1      $URD         62
2         $         58
3     $AiFi         41
4    $SFUND         28
5      $BNI         26
6     $RACE         23
7     $BARB         20
8     $BEAM         19
9      $ARD         17
10     $GMX         16
11    $VELA         14
12    $Prrr         14
13   $LOTTO         11
14    $ONYX          9
15     $100          9
16     $MMT          9
17     $JOE          9
18     $WMX          9
19     $XRP          8
20       $1          8
21     $SRP          8
22    $PRRR          8
23      $FB          7
24     $ZDX          7
25   $MAGIC          7
26     $ARX          7
27     $OVL          7
28     $DPX          6
29     $LKT          6
30     $GRV          6
31    $AFA.          6
32      $ID          6
33     $AFA          6
34     $PBX          5
35     $GNS          5
36      $ZZ          5
37     $PLS          5
38   $PESOS          5
39  $HOBBES          5
40   $darge          5
41     $WOM          5
42    $ARBK