In [99]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from mtranslate import translate
import nltk
from IPython.display import display
import plotly.express as px
import spacy

In [157]:
def translate_to_english(text):
    """Translate ``text`` into English using ``mtranslate.translate``.

    Returns the translation, or the original ``text`` unchanged when the
    translator hands back a falsy value (such as an empty string).
    """
    result = translate(text, 'en')
    if result:
        return result
    return text

def translate_cell(cell):
    """Translate a single cell value, leaving the 'null' placeholder as is."""
    if cell != 'null':
        return translate_to_english(cell)
    # 'null' marks a missing value; there is nothing to translate.
    return cell

def clean_text(text):
    """Normalise free text into a space-separated string of lemmatised tokens.

    Steps, in order: remove URLs, drop every non-ASCII character (this removes
    emojis, but also accented letters and non-Latin scripts), lowercase, strip
    ASCII punctuation and digits, tokenise, remove English stop words,
    lemmatise, and discard single-character tokens.

    Parameters
    ----------
    text : str
        Raw text. A non-string value (e.g. NaN or None for a missing cell) is
        treated as having no content.

    Returns
    -------
    str
        The cleaned text, or the sentinel "empty after cleaning" when nothing
        survives the pipeline.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; report "no content" instead.
        return "empty after cleaning"

    # Remove URLs. The dot after "www" is escaped so that only real "www."
    # prefixes match, not "www" followed by an arbitrary character.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Drop all non-ASCII characters (emojis included).
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = text.lower()
    # Punctuation is stripped before tokenising, so "don't" becomes "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)  # Remove numbers

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Single characters carry no meaning for the later entity analysis.
    tokens = [word for word in tokens if len(word) > 1]

    # Named `cleaned` rather than `clean_text` to avoid shadowing this function.
    cleaned = ' '.join(tokens)
    return cleaned if cleaned else "empty after cleaning"

def clean_dataframe_column(df, column_name):
    """Apply ``clean_text`` to every value of one column of ``df``.

    The column is overwritten on the frame that was passed in (the caller's
    object is modified), and that same frame is returned.
    """
    cleaned_values = df[column_name].apply(clean_text)
    df[column_name] = cleaned_values
    return df

In [204]:
# Directory holding this notebook's input files. Kept in one place so the
# notebook can be pointed at another machine by editing a single value.
DATA_DIR = '/Users/alinabaciu/Desktop/Social_Web/Notebook/Alina/data_files'

# One-off translation step, kept for provenance. NOTE(review): presumably this
# is how translated.pkl was produced - confirm before deleting.
# df = pd.read_csv('/Users/alinabaciu/Desktop/Social_Web/Notebook/Alina/data_files/dataset_text.csv')
# df['full_name'].replace(['', pd.NA], 'null', inplace=True)
# df['description'].replace(['', pd.NA], 'null', inplace=True)
# df['description'] = df['description'].apply(lambda x: translate_cell(x))
# df['full_name'] = df['full_name'].apply(lambda x: translate_cell(x))

# Unpickling can execute arbitrary code: only load pickle files you created
# yourself or otherwise trust.
file_path = f'{DATA_DIR}/translated.pkl'
df1 = pd.read_pickle(file_path)
df2 = pd.read_csv(f'{DATA_DIR}/dataset_text2.csv')

# Preview both inputs: df1 holds the text fields, df2 the numeric features.
display(df1.head())
display(df2.head())

Unnamed: 0,username,full_name,description,is_fake
0,reallilscrappy,Lil Scrappy,Booking& management : @RealTaliban 470-343-940...,False
1,alldaytravel,#alldaytravel 🌴,We encourage you to celebrate our beautiful wo...,False
2,ciriljazbec,Cyril Jazbec,National Geographic photographer / based in Sl...,False
3,kickposters,And Freebairn KickPosters,Illustrator | Designer Clients Include: adidas...,False
4,alliemtaylor,Alexandra Taylor,allie@livetheadventure.club Snapchat: allie-ta...,False


Unnamed: 0,following,followers,username_length,full_name_length,description_length,username_has_number,full_name_has_number,description_has_number,is_fake
0,953,206135,9,9,139,False,False,True,False
1,827,138759,9,9,56,False,False,False,False
2,201,1010205,10,17,88,False,False,False,False
3,243,277538,10,4,56,False,False,False,False
4,4,316685,11,12,132,False,False,False,False


In [205]:
# Clean a copy so that df1 keeps the raw text and this cell can be re-run.
# clean_dataframe_column modifies the frame it receives, and cleaning text
# twice is not harmless: the sentinel "empty after cleaning" contains the stop
# word "after", so a second pass would turn it into "empty cleaning" and the
# sentinel filters further down would no longer match it.
dataframe = df1.copy()
for text_column in ('description', 'full_name', 'username'):
    dataframe = clean_dataframe_column(dataframe, text_column)

#dataframe = dataframe[(dataframe['description'].str.strip() != '') & (dataframe['full_name'].str.strip() != '')] # from 1386 to 1295
# Mark blank description/full_name cells with the 'null' placeholder.
# NOTE(review): clean_text substitutes a sentinel for empty results, so this
# mask is expected to match nothing - confirm whether it is still needed.
dataframe.loc[(dataframe['description'].str.strip() == '') | (dataframe['full_name'].str.strip() == ''), ['description', 'full_name']] = 'null'
#dataframe = dataframe[~((dataframe['description'].str.strip() == 'null') & (dataframe['full_name'].str.strip() == 'null'))] # from 1295 to 1020

# Use readable labels for the class column; the charts below colour by it.
dataframe['is_fake'] = dataframe['is_fake'].replace({True: 'fake', False: 'real'})
dataframe

Unnamed: 0,username,full_name,description,is_fake
0,reallilscrappy,lil scrappy,booking management realtaliban booklilscrappyg...,real
1,alldaytravel,alldaytravel,encourage celebrate beautiful world best trave...,real
2,ciriljazbec,cyril jazbec,national geographic photographer based sloveni...,real
3,kickposters,freebairn kickposters,illustrator designer client include adidas ree...,real
4,alliemtaylor,alexandra taylor,allielivetheadventureclub snapchat allietaylor...,real
...,...,...,...,...
1381,alipoco,,,fake
1382,golmar,,,fake
1383,sooltan,empty after cleaning,empty after cleaning,fake
1384,sangstone,rock,artificial stone production artificialstone st...,fake


In [206]:
# Run this before the description analysis: it drops the accounts whose
# description had nothing left after cleaning.
has_description_text = dataframe['description'] != "empty after cleaning"
dataframe = dataframe[has_description_text]

In [207]:
# Split the accounts by label.
real_accounts = dataframe.loc[dataframe['is_fake'] == 'real']
fake_accounts = dataframe.loc[dataframe['is_fake'] == 'fake']
length_real = len(real_accounts)
length_fake = len(fake_accounts)

# Within each label, separate the accounts whose description is the 'null'
# placeholder from those that have a description.
real_accounts_with_null_description = real_accounts.loc[real_accounts['description'] == 'null']
real_accounts_with_not_null_description = real_accounts.loc[real_accounts['description'] != 'null']
fake_accounts_with_null_description = fake_accounts.loc[fake_accounts['description'] == 'null']
fake_accounts_with_not_null_description = fake_accounts.loc[fake_accounts['description'] != 'null']

length_real_with_null = len(real_accounts_with_null_description)
length_real_without_null = len(real_accounts_with_not_null_description)
length_fake_with_null = len(fake_accounts_with_null_description)
length_fake_without_null = len(fake_accounts_with_not_null_description)

# Shares of the whole data set, and shares within each label (fractions in 0..1).
percentage_real = length_real / len(dataframe)
percentage_fake = length_fake / len(dataframe)
percentage_real_with_null = length_real_with_null / length_real
percentage_real_without_null = length_real_without_null / length_real
percentage_fake_with_null = length_fake_with_null / length_fake
percentage_fake_without_null = length_fake_without_null / length_fake

# Long-form table for plotting: one row per (account type, description state).
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Description': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        fraction * 100
        for fraction in (
            percentage_real_without_null,
            percentage_real_with_null,
            percentage_fake_without_null,
            percentage_fake_with_null,
        )
    ],
}
plot_data = pd.DataFrame(data)

# Stacked bars: each account type adds up to 100 %.
fig = px.bar(
    plot_data,
    x='Account Type',
    y='Percentage',
    color='Description',
    barmode='stack',
    labels={'Percentage': 'Percentage (%)'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text='Percentage',
    title='Description Type for Real and Fake Accounts',
)
# Print the rounded percentage inside every bar segment.
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [208]:
# Load spaCy's small English pipeline once; the entity helpers below reuse it.
# If the model is not installed, run spacy.cli.download("en_core_web_sm") first.
ner_model = spacy.load("en_core_web_sm")
def count_entities(text):
    """Count the named entities spaCy finds in ``text``, per entity label.

    Returns a dict holding one integer per tracked label. The keys and their
    order are always the same; a label that does not occur maps to 0. Entities
    carrying any other label are ignored.
    """
    tracked_labels = (
        'PERSON', 'ORG', 'GPE', 'DATE', 'NORP', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW', 'LANGUAGE', 'FAC', 'MONEY', 'QUANTITY',
        'ORDINAL', 'CARDINAL',
    )
    entity_count = dict.fromkeys(tracked_labels, 0)

    for label in (ent.label_ for ent in ner_model(text).ents):
        if label in entity_count:
            entity_count[label] += 1

    return entity_count

def add_entity_counts_as_columns(df, text_column):
    """Return ``df`` with per-label entity counts appended as ``nr_<LABEL>`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    text_column : str
        Name of the column whose text is passed to ``count_entities``.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns followed by one ``nr_<LABEL>``
        column per entity label that occurs at least once. Labels whose count
        is zero in every row are left out.
    """
    per_row_counts = df[text_column].apply(count_entities)
    # Build the count table in one step from the list of dicts (much cheaper
    # than expanding row by row with .apply(pd.Series)).
    entity_counts = pd.DataFrame(per_row_counts.tolist(), index=df.index)

    # Drop all-zero columns among the new counts only. Filtering the combined
    # frame would also remove any original column that happens to be all zero.
    entity_counts = entity_counts.loc[:, (entity_counts != 0).any(axis=0)]

    return pd.concat([df, entity_counts.add_prefix('nr_')], axis=1)

def get_top_entities(df, text_column):
    """Rank the ORG and GPE entity strings found in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text to analyse.
    text_column : str
        Name of the column holding the text. Non-string values are skipped.

    Returns
    -------
    tuple[list[tuple[str, int]], list[tuple[str, int]]]
        ``(top_orgs, top_gpes)``: ``(entity text, occurrences)`` pairs sorted by
        occurrences, highest first. Ties keep first-seen order.
    """
    org_counts = {}
    gpe_counts = {}
    tallies = {'ORG': org_counts, 'GPE': gpe_counts}

    # Run the NER pipeline once per text and tally both labels from the same
    # result. Iterating over the values (rather than looking rows up by index
    # label) also keeps rows with duplicate index labels from being missed.
    for text in df[text_column]:
        if not isinstance(text, str):
            continue
        for ent in ner_model(text).ents:
            tally = tallies.get(ent.label_)
            if tally is not None:
                tally[ent.text] = tally.get(ent.text, 0) + 1

    # sorted() is stable, so equal counts stay in first-seen order.
    top_orgs = sorted(org_counts.items(), key=lambda item: item[1], reverse=True)
    top_gpes = sorted(gpe_counts.items(), key=lambda item: item[1], reverse=True)

    return top_orgs, top_gpes

In [209]:
# Append the per-label entity counts found in the cleaned descriptions.
df_description = add_entity_counts_as_columns(df=dataframe, text_column='description')
df_description.head()

Unnamed: 0,username,full_name,description,is_fake,nr_PERSON,nr_ORG,nr_GPE,nr_DATE,nr_NORP,nr_LOC,nr_PRODUCT,nr_WORK_OF_ART,nr_LANGUAGE,nr_FAC,nr_ORDINAL,nr_CARDINAL
0,reallilscrappy,lil scrappy,booking management realtaliban booklilscrappyg...,real,2,0,0,0,0,0,0,0,0,0,0,0
1,alldaytravel,alldaytravel,encourage celebrate beautiful world best trave...,real,0,0,0,0,0,0,0,0,0,0,0,0
2,ciriljazbec,cyril jazbec,national geographic photographer based sloveni...,real,0,1,0,0,0,0,0,0,0,0,0,0
3,kickposters,freebairn kickposters,illustrator designer client include adidas ree...,real,1,0,0,0,0,0,0,0,0,0,0,0
4,alliemtaylor,alexandra taylor,allielivetheadventureclub snapchat allietaylor...,real,0,0,0,0,0,0,0,0,0,0,0,0


In [210]:
# Sum every entity-count column per account type, then reshape to long form
# (one row per account type and entity label) for plotting.
nr_columns = list(filter(lambda col: col.startswith('nr_'), df_description.columns))
sum_by_fake = df_description.groupby('is_fake')[nr_columns].sum()
sum_by_fake = sum_by_fake.reset_index()
melted = sum_by_fake.melt(id_vars='is_fake', var_name='Column', value_name='Total Value')

fig = px.bar(
    melted,
    x='Column',
    y='Total Value',
    color='is_fake',
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    text='Total Value',
    title='Total number of entities per Account Type - Description',
    labels={'Column': 'Different entities', 'Total Value': 'Total Value'},
)

fig.show()

In [172]:
# Run this before the full_name analysis: it drops the accounts whose
# full_name had nothing left after cleaning.
has_full_name_text = dataframe['full_name'] != "empty after cleaning"
dataframe = dataframe[has_full_name_text]

In [173]:
# Split the accounts by label.
real_accounts = dataframe.loc[dataframe['is_fake'] == 'real']
fake_accounts = dataframe.loc[dataframe['is_fake'] == 'fake']
length_real = len(real_accounts)
length_fake = len(fake_accounts)

# Within each label, separate the accounts whose full_name is the 'null'
# placeholder from those that have a full name.
real_accounts_with_null_full_name = real_accounts.loc[real_accounts['full_name'] == 'null']
real_accounts_with_not_null_full_name = real_accounts.loc[real_accounts['full_name'] != 'null']
fake_accounts_with_null_full_name = fake_accounts.loc[fake_accounts['full_name'] == 'null']
fake_accounts_with_not_null_full_name = fake_accounts.loc[fake_accounts['full_name'] != 'null']

length_real_with_null_full_name = len(real_accounts_with_null_full_name)
length_real_without_null_full_name = len(real_accounts_with_not_null_full_name)
length_fake_with_null_full_name = len(fake_accounts_with_null_full_name)
length_fake_without_null_full_name = len(fake_accounts_with_not_null_full_name)

# Shares of the whole data set, and shares within each label (fractions in 0..1).
percentage_real = length_real / len(dataframe)
percentage_fake = length_fake / len(dataframe)
percentage_real_with_null_full_name = length_real_with_null_full_name / length_real
percentage_real_without_null_full_name = length_real_without_null_full_name / length_real
percentage_fake_with_null_full_name = length_fake_with_null_full_name / length_fake
percentage_fake_without_null_full_name = length_fake_without_null_full_name / length_fake

# Long-form table for plotting: one row per (account type, full_name state).
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Full_Name': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        fraction * 100
        for fraction in (
            percentage_real_without_null_full_name,
            percentage_real_with_null_full_name,
            percentage_fake_without_null_full_name,
            percentage_fake_with_null_full_name,
        )
    ],
}
plot_data = pd.DataFrame(data)

# Stacked bars: each account type adds up to 100 %.
fig = px.bar(
    plot_data,
    x='Account Type',
    y='Percentage',
    color='Full_Name',
    barmode='stack',
    labels={'Percentage': 'Percentage (%)'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text='Percentage',
    title='Full_Name Type for Real and Fake Accounts',
)
# Print the rounded percentage inside every bar segment.
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [174]:
# Append the per-label entity counts found in the cleaned full names.
df_fullname = add_entity_counts_as_columns(dataframe, 'full_name')
# PERSON counts are excluded from this view. add_entity_counts_as_columns omits
# labels that never occur, so the column may be absent; errors='ignore' keeps
# the cell from failing with a KeyError in that case.
df_fullname = df_fullname.drop(columns=['nr_PERSON'], errors='ignore')
df_fullname.head()

Unnamed: 0,username,full_name,description,is_fake,nr_ORG,nr_GPE,nr_DATE,nr_NORP,nr_LOC,nr_PRODUCT,nr_WORK_OF_ART,nr_ORDINAL,nr_CARDINAL
0,reallilscrappy,lil scrappy,booking management realtaliban booklilscrappyg...,real,0,0,0,0,0,0,0,0,0
1,alldaytravel,alldaytravel,encourage celebrate beautiful world best trave...,real,0,0,0,0,0,0,0,0,0
2,ciriljazbec,cyril jazbec,national geographic photographer based sloveni...,real,0,0,0,0,0,0,0,0,0
3,kickposters,freebairn kickposters,illustrator designer client include adidas ree...,real,1,0,0,0,0,0,0,0,0
4,alliemtaylor,alexandra taylor,allielivetheadventureclub snapchat allietaylor...,real,0,0,0,0,0,0,0,0,0


In [175]:
# Sum every entity-count column per account type, then reshape to long form
# (one row per account type and entity label) for plotting.
nr_columns = list(filter(lambda col: col.startswith('nr_'), df_fullname.columns))
sum_by_fake = df_fullname.groupby('is_fake')[nr_columns].sum()
sum_by_fake = sum_by_fake.reset_index()
melted = sum_by_fake.melt(id_vars='is_fake', var_name='Column', value_name='Total Value')

fig = px.bar(
    melted,
    x='Column',
    y='Total Value',
    color='is_fake',
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    text='Total Value',
    title='Total number of entities per Account Type - Full_Name',
    labels={'Column': 'Different entities', 'Total Value': 'Total Value'},
)

fig.show()

In [203]:
# Run this before the username analysis: it drops the accounts whose username
# had nothing left after cleaning, then reports the remaining (rows, columns).
has_username_text = dataframe['username'] != "empty after cleaning"
dataframe = dataframe[has_username_text]
dataframe.shape

(1368, 4)

In [199]:
# Append the per-label entity counts found in the cleaned usernames.
df_username = add_entity_counts_as_columns(dataframe, 'username')
# PERSON counts are excluded from this view. add_entity_counts_as_columns omits
# labels that never occur, so the column may be absent; errors='ignore' keeps
# the cell from failing with a KeyError in that case.
df_username = df_username.drop(columns=['nr_PERSON'], errors='ignore')
df_username.head()

Unnamed: 0,username,full_name,description,is_fake,nr_ORG,nr_GPE,nr_NORP,nr_CARDINAL
0,reallilscrappy,lil scrappy,booking management realtaliban booklilscrappyg...,real,0,0,0,0
1,alldaytravel,alldaytravel,encourage celebrate beautiful world best trave...,real,0,0,0,0
2,ciriljazbec,cyril jazbec,national geographic photographer based sloveni...,real,0,0,0,0
3,kickposters,freebairn kickposters,illustrator designer client include adidas ree...,real,0,0,0,0
4,alliemtaylor,alexandra taylor,allielivetheadventureclub snapchat allietaylor...,real,0,0,0,0


In [200]:
# Sum every entity-count column per account type, then reshape to long form
# (one row per account type and entity label) for plotting.
nr_columns = list(filter(lambda col: col.startswith('nr_'), df_username.columns))
sum_by_fake = df_username.groupby('is_fake')[nr_columns].sum()
sum_by_fake = sum_by_fake.reset_index()
melted = sum_by_fake.melt(id_vars='is_fake', var_name='Column', value_name='Total Value')

fig = px.bar(
    melted,
    x='Column',
    y='Total Value',
    color='is_fake',
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    text='Total Value',
    title='Total number of entities per Account Type - Username',
    labels={'Column': 'Different entities', 'Total Value': 'Total Value'},
)
fig.update_layout(width=500, height=400)
fig.show()