In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from mtranslate import translate
import nltk
from IPython.display import display
import plotly.express as px
import spacy
import numpy as np

In [2]:
def translate_to_english(text):
    translated_text = translate(text, 'en')
    return translated_text if translated_text else text

def translate_cell(cell):
    return translate_to_english(cell) if cell != 'null' else cell

def clean_text(text):
  
    text = re.sub(r'http\S+|www.\S+', '', text) # Remove URLs
    text = text.encode('ascii', 'ignore').decode('ascii')   #Remove emojis
    text = text.lower() # Lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english')) # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer() # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = list(set(tokens))
    tokens = [word for word in tokens if len(word) > 1] # Remove single characters and extra spaces
    clean_text = ' '.join(tokens)

    return clean_text if clean_text  else  "empty after cleaning"

def clean_dataframe_column(df, column_name):
    df[column_name] = df[column_name].apply(clean_text)
    return df

In [3]:
# df = pd.read_csv('/Users/alinabaciu/Desktop/Social_Web/Notebook/Alina/data_files/dataset_text.csv')
# df['full_name'].replace(['', pd.NA], 'null', inplace=True)
# df['description'].replace(['', pd.NA], 'null', inplace=True)
# df['description'] = df['description'].apply(lambda x: translate_cell(x))
# df['full_name'] = df['full_name'].apply(lambda x: translate_cell(x))

file_path = '/Users/alinabaciu/Desktop/Social_Web/Notebook/Alina/data_files/text_dataset.csv'
#df1 = pd.read_pickle(file_path)
df1 = pd.read_csv(file_path)
df2 = pd.read_csv('/Users/alinabaciu/Desktop/Social_Web/Notebook/Alina/data_files/dataset_text2.csv')
display(df1.head())
display(df2.head())

Unnamed: 0,username,full_name,description,is_fake
0,linda_lomelino,Linda Lomelino,Photographer & cookbook author | Sweden | lind...,False
1,angelicablick,Angelica Blick,🌍 Fashion- & travelblogger from Sweden 🇸🇪 🌴Cur...,False
2,giulianafortuna,GF,giuliana.fortuna@icloud.com,False
3,kerriehessillustration,Kerrie Hess Illustrator,Australian Artist who has illustrated for Cart...,False
4,renan_ozturk,Renan Ozturk,@Camp4Collective Filmmaker // @TheNorthFace Cl...,False


Unnamed: 0,following,followers,username_length,full_name_length,description_length,username_has_number,full_name_has_number,description_has_number,is_fake
0,953,206135,9,9,139,False,False,True,False
1,827,138759,9,9,56,False,False,False,False
2,201,1010205,10,17,88,False,False,False,False
3,243,277538,10,4,56,False,False,False,False
4,4,316685,11,12,132,False,False,False,False


In [4]:
df1['is_fake'] = df1['is_fake'].replace({True: 'fake', False: 'real'})
real_accounts = df1[df1['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = df1[df1['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_description = real_accounts[real_accounts['description'] == 'null']
length_real_with_null = len(real_accounts_with_null_description)
real_accounts_with_not_null_description = real_accounts[real_accounts['description'] != 'null']
length_real_without_null = len(real_accounts_with_not_null_description)
fake_accounts_with_null_description = fake_accounts[fake_accounts['description'] == 'null']
length_fake_with_null = len(fake_accounts_with_null_description)
fake_accounts_with_not_null_description = fake_accounts[fake_accounts['description'] != 'null']
length_fake_without_null = len(fake_accounts_with_not_null_description)

percentage_real = length_real/len(df1)
percentage_real_with_null = length_real_with_null/length_real
percentage_real_without_null = length_real_without_null/length_real
percentage_fake = length_fake/len(df1)
percentage_fake_with_null = length_fake_with_null/length_fake
percentage_fake_without_null = length_fake_without_null/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Description': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null * 100,
        percentage_real_with_null * 100,
        percentage_fake_without_null * 100,
        percentage_fake_with_null * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Description',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Description Type for Accounts - Before Cleaning')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [5]:
df1['is_fake'] = df1['is_fake'].replace({True: 'fake', False: 'real'})
real_accounts = df1[df1['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = df1[df1['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_full_name = real_accounts[real_accounts['full_name'] == 'null']
length_real_with_null_full_name = len(real_accounts_with_null_full_name)
real_accounts_with_not_null_full_name = real_accounts[real_accounts['full_name'] != 'null']
length_real_without_null_full_name = len(real_accounts_with_not_null_full_name)
fake_accounts_with_null_full_name = fake_accounts[fake_accounts['full_name'] == 'null']
length_fake_with_null_full_name = len(fake_accounts_with_null_full_name)
fake_accounts_with_not_null_full_name = fake_accounts[fake_accounts['full_name'] != 'null']
length_fake_without_null_full_name = len(fake_accounts_with_not_null_full_name)

percentage_real = length_real/len(df1)
percentage_real_with_null_full_name = length_real_with_null_full_name/length_real
percentage_real_without_null_full_name = length_real_without_null_full_name/length_real
percentage_fake = length_fake/len(df1)
percentage_fake_with_null_full_name = length_fake_with_null_full_name/length_fake
percentage_fake_without_null_full_name = length_fake_without_null_full_name/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Full_Name': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null_full_name * 100,
        percentage_real_with_null_full_name * 100,
        percentage_fake_without_null_full_name * 100,
        percentage_fake_with_null_full_name * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Full_Name',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Full_Name Type for Real and Fake Accounts - Before Cleaning')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [6]:
df1['is_fake'] = df1['is_fake'].replace({True: 'fake', False: 'real'})
real_accounts = df1[df1['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = df1[df1['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_username = real_accounts[real_accounts['username'] == 'empty after cleaning']
length_real_with_null_username = len(real_accounts_with_null_username)
real_accounts_with_not_null_username = real_accounts[real_accounts['username'] != 'empty after cleaning']
length_real_without_null_username = len(real_accounts_with_not_null_username)
fake_accounts_with_null_username = fake_accounts[fake_accounts['username'] == 'empty after cleaning']
length_fake_with_null_username = len(fake_accounts_with_null_username)
fake_accounts_with_not_null_username = fake_accounts[fake_accounts['username'] != 'empty after cleaning']
length_fake_without_null_username = len(fake_accounts_with_not_null_username)

percentage_real = length_real/len(df1)
percentage_real_with_null_username = length_real_with_null_username/length_real
percentage_real_without_null_username = length_real_without_null_username/length_real
percentage_fake = length_fake/len(df1)
percentage_fake_with_null_username = length_fake_with_null_username/length_fake
percentage_fake_without_null_username= length_fake_without_null_username/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Full_Name': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null_username * 100,
        percentage_real_with_null_username * 100,
        percentage_fake_without_null_username * 100,
        percentage_fake_with_null_username * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Full_Name',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Username Type for Accounts - Before Cleaning')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [7]:
df1['description'] = df1['description'].astype(str)
df1['full_name'] = df1['full_name'].astype(str)
df1['username'] = df1['username'].astype(str)
dataframe = clean_dataframe_column(df1,'description')
dataframe = clean_dataframe_column(df1,'full_name')
dataframe = clean_dataframe_column(df1,'username')
#dataframe = dataframe[(dataframe['description'].str.strip() != '') & (dataframe['full_name'].str.strip() != '')] # from 1386 to 1295
dataframe.loc[(dataframe['description'].str.strip() == '') | (dataframe['full_name'].str.strip() == ''), ['description', 'full_name']] = 'null'
#dataframe = dataframe[~((dataframe['description'].str.strip() == 'null') & (dataframe['full_name'].str.strip() == 'null'))] # from 1295 to 1020
dataframe['is_fake'] = dataframe['is_fake'].replace({True: 'fake', False: 'real'})
dataframe

Unnamed: 0,username,full_name,description,is_fake
0,lindalomelino,lomelino linda,author photographer sweden printsandpropsse sh...,real
1,angelicablick,angelica blick,sweden angelicablicklivese youtube la snapchat...,real
2,giulianafortuna,gf,giulianafortunaicloudcom,real
3,kerriehessillustration,illustrator hess kerrie,artist louis lancome laduree elie paris austra...,real
4,renanozturk,renan ozturk,campcollective natgeo thenorthface photographe...,real
...,...,...,...,...
529,amirrezaalmas,mizm,mahi boy kermanshah dey af diamond,fake
530,alipvpage,ali barahimi,love clutch playing perspolis esfahan live,fake
531,mahdiyeh,empty after cleaning,empty after cleaning,fake
532,fashionwnderfully,empty after cleaning,idea follow style attractive,fake


In [109]:
dataframe = dataframe[dataframe['description'] != "empty after cleaning"] # this is run before the description analysis

In [49]:
real_accounts = dataframe[dataframe['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = dataframe[dataframe['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_description = real_accounts[real_accounts['description'] == 'null']
length_real_with_null = len(real_accounts_with_null_description)
real_accounts_with_not_null_description = real_accounts[real_accounts['description'] != 'null']
length_real_without_null = len(real_accounts_with_not_null_description)
fake_accounts_with_null_description = fake_accounts[fake_accounts['description'] == 'null']
length_fake_with_null = len(fake_accounts_with_null_description)
fake_accounts_with_not_null_description = fake_accounts[fake_accounts['description'] != 'null']
length_fake_without_null = len(fake_accounts_with_not_null_description)

percentage_real = length_real/len(dataframe)
percentage_real_with_null = length_real_with_null/length_real
percentage_real_without_null = length_real_without_null/length_real
percentage_fake = length_fake/len(dataframe)
percentage_fake_with_null = length_fake_with_null/length_fake
percentage_fake_without_null = length_fake_without_null/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Description': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null * 100,
        percentage_real_with_null * 100,
        percentage_fake_without_null * 100,
        percentage_fake_with_null * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Description',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Description Type for Account Types - After cleaning')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [23]:
ner_model = spacy.load("en_core_web_sm") # spacy.cli.download("en_core_web_sm")
def count_entities(text):
    
    doc = ner_model(text)
    entity_count = {'PERSON': 0, 'ORG': 0, 'GPE': 0, 'DATE': 0, 'NORP': 0, 'LOC': 0, 'PRODUCT': 0, 'EVENT': 0, 'WORK_OF_ART': 0, 'LAW': 0, 'LANGUAGE': 0, 'FAC': 0, 'MONEY': 0, 'QUANTITY': 0, 'ORDINAL': 0, 'CARDINAL': 0}
    for ent in doc.ents:
        if ent.label_ in entity_count:
            entity_count[ent.label_] += 1
    
    return entity_count

def add_entity_counts_as_columns(df, text_column):

    entity_counts = df[text_column].apply(count_entities).apply(pd.Series)
    df = pd.concat([df, entity_counts.add_prefix('nr_')], axis=1)
    df = df.loc[:, (df != 0).any(axis=0)]
    
    return df

def get_top_entities(df, text_column):
 
    entity_counts = df[text_column].apply(count_entities)
    org_counts = {}
    gpe_counts = {}

    for index, row in entity_counts.items():
        if 'ORG' in row:
            orgs = df.at[index, text_column]
            if row['ORG'] > 0 and isinstance(orgs, str):
                org_list = [ent.text for ent in ner_model(orgs).ents if ent.label_ == 'ORG']
                for org in org_list:
                    if org in org_counts:
                        org_counts[org] += 1
                    else:
                        org_counts[org] = 1

        if 'GPE' in row:
            gpes = df.at[index, text_column]
            if row['GPE'] > 0 and isinstance(gpes, str):
                gpe_list = [ent.text for ent in ner_model(gpes).ents if ent.label_ == 'GPE']
                for gpe in gpe_list:
                    if gpe in gpe_counts:
                        gpe_counts[gpe] += 1
                    else:
                        gpe_counts[gpe] = 1

   
    top_orgs = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)  # Sort entities by count
    top_gpes = sorted(gpe_counts.items(), key=lambda x: x[1], reverse=True)

    return top_orgs, top_gpes

In [110]:
df_description = add_entity_counts_as_columns(dataframe, 'description')
df_description.head()

Unnamed: 0,username,full_name,description,is_fake,nr_PERSON,nr_ORG,nr_GPE,nr_DATE,nr_NORP,nr_LOC,nr_PRODUCT,nr_ORDINAL,nr_CARDINAL
0,lindalomelino,lomelino linda,cookbook author printsandpropsse photographer ...,real,0,0,0,0,1,0,0,0,0
1,angelicablick,angelica blick,la angelicablick angelicablicklivese currently...,real,1,0,0,0,1,0,0,0,0
2,giulianafortuna,gf,giulianafortunaicloudcom,real,0,0,0,0,0,0,0,0,0
3,kerriehessillustration,illustrator hess kerrie,louis cartier vuitton printemps saab art artis...,real,2,2,0,0,1,0,0,0,0
4,renanozturk,ozturk renan,natgeo thenorthface photographer climber heart...,real,1,0,0,0,0,0,0,0,0


In [35]:
# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_PERSON']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_PERSON']

# Get the sum of NR_PERSON for real and fake descriptions
sum_real_descriptions = real_descriptions.var()
sum_fake_descriptions = fake_descriptions.var()

print("Sum of NR_PERSON for real descriptions:", sum_real_descriptions)
print("Sum of NR_PERSON for fake descriptions:", sum_fake_descriptions)


Sum of NR_PERSON for real descriptions: 0.18782108960032448
Sum of NR_PERSON for fake descriptions: 0.1270349143841631


In [111]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_PERSON']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_PERSON']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 2.691929574266145
P-value: 0.003672252379742676
Reject the null hypothesis: The mean of real dataset is higher than fake dataset.


In [112]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_ORG']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_ORG']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 3.9763979497902624
P-value: 4.1779262076971244e-05
Reject the null hypothesis: The mean of real dataset is higher than fake dataset.


In [113]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_NORP']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_NORP']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 0.8986795117494014
P-value: 0.18462972000278155
Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.


In [114]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_GPE']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_GPE']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 2.688816702320401
P-value: 0.003711878783775548
Reject the null hypothesis: The mean of real dataset is higher than fake dataset.


In [115]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_description[df_description['is_fake'] == 'real']['nr_CARDINAL']
fake_descriptions = df_description[df_description['is_fake'] == 'fake']['nr_CARDINAL']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat < 0:  # Reversed comparison logic for fake mean being higher
    print("Reject the null hypothesis: The mean of fake dataset is higher than real dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of fake dataset is not higher than real dataset.")


T-statistic: -0.8372957777151125
P-value: 0.20144229973738242
Fail to reject the null hypothesis: The mean of fake dataset is not higher than real dataset.


In [116]:
nr_columns = [col for col in df_description.columns if col.startswith('nr_')]
sum_by_fake = df_description.groupby('is_fake')[nr_columns].sum().reset_index()
melted = pd.melt(sum_by_fake, id_vars='is_fake', var_name='Column', value_name='Total Value')
fig = px.bar(melted, x='Column', y='Total Value', color='is_fake', barmode='stack',color_discrete_sequence=px.colors.qualitative.Vivid,text = 'Total Value',
             title='Total number of entities per Account Type - Description', labels={'Column': 'Different entities', 'Total Value': 'Total Value'})

fig.show()

In [117]:
from collections import defaultdict

def get_entities_for_fake_accounts(df, text_column, is_fake_column):
    grouped_entities = defaultdict(list)

    for index, row in df[df[is_fake_column] == 'fake'].iterrows():
        doc = ner_model(row[text_column])
        for ent in doc.ents:
            grouped_entities[ent.label_].append(ent.text)

    sorted_entities = {label: sorted(entities) for label, entities in grouped_entities.items()}
    return sorted_entities

fake_entities = get_entities_for_fake_accounts(dataframe, 'description', 'is_fake')

# Print sorted entities by label
for label, entities in sorted(fake_entities.items()):
    print(f"{label}: {entities}")

CARDINAL: ['equestrian', 'four', 'one', 'one', 'tatality']
DATE: ['autumn mahi th day seventh', 'month', 'month', 'spring', 'today', 'today', 'year']
GPE: ['georgia', 'hhhh', 'iran', 'iran', 'iran', 'iran', 'iran', 'khodamamanambabamdadasham', 'khordad', 'khozestan', 'khuda', 'khuzestan', 'khuzestan', 'la', 'los angeles', 'peru', 'qlbmi', 'razi', 'rome', 'tina', 'turkey']
LOC: ['nova']
NORP: ['iranian', 'islamic', 'islamic', 'islamic', 'javaheri', 'khorazian', 'kurdish', 'saeidkiarsian']
ORDINAL: ['first', 'first']
ORG: ['detail office', 'ge', 'keifiat', 'life im saram', 'lover narmak news', 'mazda', 'nouri civil hassan thanks hassansaboori', 'quotyouquot excuse person', 'shahrekord', 'telegram page', 'text karma guitar biomedical', 'world muzic tehran singel']
PERSON: ['az dar mirm', 'azerbaijan turkiye', 'bazi loy', 'borasi gamlidunya', 'botox collagen', 'elam afsayd mikone', 'farzad farzin farzadfarzin person fan', 'god', 'gunler janimi saldi falk', 'hairfashionaccessory karaj beaut

In [70]:
real_accounts = dataframe[dataframe['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = dataframe[dataframe['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_full_name = real_accounts[real_accounts['full_name'] == 'null']
length_real_with_null_full_name = len(real_accounts_with_null_full_name)
real_accounts_with_not_null_full_name = real_accounts[real_accounts['full_name'] != 'null']
length_real_without_null_full_name = len(real_accounts_with_not_null_full_name)
fake_accounts_with_null_full_name = fake_accounts[fake_accounts['full_name'] == 'null']
length_fake_with_null_full_name = len(fake_accounts_with_null_full_name)
fake_accounts_with_not_null_full_name = fake_accounts[fake_accounts['full_name'] != 'null']
length_fake_without_null_full_name = len(fake_accounts_with_not_null_full_name)

percentage_real = length_real/len(dataframe)
percentage_real_with_null_full_name = length_real_with_null_full_name/length_real
percentage_real_without_null_full_name = length_real_without_null_full_name/length_real
percentage_fake = length_fake/len(dataframe)
percentage_fake_with_null_full_name = length_fake_with_null_full_name/length_fake
percentage_fake_without_null_full_name = length_fake_without_null_full_name/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Full_Name': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null_full_name * 100,
        percentage_real_with_null_full_name * 100,
        percentage_fake_without_null_full_name * 100,
        percentage_fake_with_null_full_name * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Full_Name',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Full_Name Type for Real and Fake Accounts')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [120]:
dataframe = dataframe[dataframe['full_name'] != "empty after cleaning"]

In [121]:
df_fullname = add_entity_counts_as_columns(dataframe, 'full_name')
df_fullname = df_fullname.drop(columns=['nr_PERSON'])
df_fullname.head()

Unnamed: 0,username,full_name,description,is_fake,nr_ORG,nr_GPE,nr_DATE,nr_NORP,nr_WORK_OF_ART
0,lindalomelino,lomelino linda,cookbook author printsandpropsse photographer ...,real,0,0,0,0,0
1,angelicablick,angelica blick,la angelicablick angelicablicklivese currently...,real,0,1,0,0,0
2,giulianafortuna,gf,giulianafortunaicloudcom,real,0,0,0,0,0
3,kerriehessillustration,illustrator hess kerrie,louis cartier vuitton printemps saab art artis...,real,0,0,0,0,0
4,renanozturk,ozturk renan,natgeo thenorthface photographer climber heart...,real,0,1,0,0,0


In [122]:
nr_columns = [col for col in df_fullname.columns if col.startswith('nr_')]
sum_by_fake = df_fullname.groupby('is_fake')[nr_columns].sum().reset_index()
melted = pd.melt(sum_by_fake, id_vars='is_fake', var_name='Column', value_name='Total Value')
fig = px.bar(melted, x='Column', y='Total Value', color='is_fake', barmode='stack',color_discrete_sequence=px.colors.qualitative.Vivid,text = 'Total Value',
             title='Total number of entities per Account Type - Full_Name', labels={'Column': 'Different entities', 'Total Value': 'Total Value'})

fig.show()

In [123]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_fullname[df_fullname['is_fake'] == 'real']['nr_GPE']
fake_descriptions = df_fullname[df_fullname['is_fake'] == 'fake']['nr_GPE']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 1.7121248655557628
P-value: 0.043794079823195345
Reject the null hypothesis: The mean of real dataset is higher than fake dataset.


In [124]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_fullname[df_fullname['is_fake'] == 'real']['nr_NORP']
fake_descriptions = df_fullname[df_fullname['is_fake'] == 'fake']['nr_NORP']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat > 0:
    print("Reject the null hypothesis: The mean of real dataset is higher than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.")

T-statistic: 1.308055902861127
P-value: 0.09577182980627762
Fail to reject the null hypothesis: The mean of real dataset is not higher than fake dataset.


In [125]:
fake_entities = get_entities_for_fake_accounts(dataframe, 'full_name', 'is_fake')

# Print sorted entities by label
for label, entities in sorted(fake_entities.items()):
    print(f"{label}: {entities}")

GPE: ['mel', 'nazanin', 'tina']
NORP: ['mohaddese', 'nazi']
PERSON: ['alex tegzas', 'ali barahimi', 'ali noorifard', 'amir tataloo', 'asadzadeh hasan', 'hamsaye khoda', 'jiji bofan', 'jimigkh hug', 'kamii', 'khajeh mohammad', 'mahmoud samy', 'mahshid', 'mahyar', 'mariiii', 'melliii', 'sana', 'sana', 'shamila', 'tabrizsport', 'zari']
WORK_OF_ART: ['anjel avi']


In [75]:
dataframe['is_fake'] = dataframe['is_fake'].replace({True: 'fake', False: 'real'})
real_accounts = dataframe[dataframe['is_fake'] == 'real']
length_real = len(real_accounts)
fake_accounts = dataframe[dataframe['is_fake'] == 'fake']
length_fake = len(fake_accounts)

real_accounts_with_null_username = real_accounts[real_accounts['username'] == 'empty after cleaning']
length_real_with_null_username = len(real_accounts_with_null_username)
real_accounts_with_not_null_username = real_accounts[real_accounts['username'] != 'empty after cleaning']
length_real_without_null_username = len(real_accounts_with_not_null_username)
fake_accounts_with_null_username = fake_accounts[fake_accounts['username'] == 'empty after cleaning']
length_fake_with_null_username = len(fake_accounts_with_null_username)
fake_accounts_with_not_null_username = fake_accounts[fake_accounts['username'] != 'empty after cleaning']
length_fake_without_null_username = len(fake_accounts_with_not_null_username)

percentage_real = length_real/len(dataframe)
percentage_real_with_null_username = length_real_with_null_username/length_real
percentage_real_without_null_username = length_real_without_null_username/length_real
percentage_fake = length_fake/len(dataframe)
percentage_fake_with_null_username = length_fake_with_null_username/length_fake
percentage_fake_without_null_username= length_fake_without_null_username/length_fake

# Create a DataFrame for plotting
data = {
    'Account Type': ['Real', 'Real', 'Fake', 'Fake'],
    'Full_Name': ['Not Empty', 'Empty', 'Not Empty', 'Empty'],
    'Percentage': [
        percentage_real_without_null_username * 100,
        percentage_real_with_null_username * 100,
        percentage_fake_without_null_username * 100,
        percentage_fake_with_null_username * 100
    ]
}
plot_data = pd.DataFrame(data)

# Create a stacked bar chart using Plotly Express
fig = px.bar(plot_data, x='Account Type', y='Percentage', color='Full_Name',
             barmode='stack', labels={'Percentage': 'Percentage (%)'},color_discrete_sequence=px.colors.qualitative.Pastel,text='Percentage',
             title='Username Type for Accounts - After Cleaning')
fig.update_traces(texttemplate='%{text:.0f}%', textposition='inside')
fig.update_layout(width=500, height=400)
fig.show()

In [128]:
dataframe = dataframe[dataframe['username'] != "empty after cleaning"] 
dataframe.shape

(521, 4)

In [129]:
df_username= add_entity_counts_as_columns(dataframe, 'username')
df_username = df_username.drop(columns=['nr_PERSON'])
df_username.head()

Unnamed: 0,username,full_name,description,is_fake,nr_ORG,nr_GPE,nr_CARDINAL
0,lindalomelino,lomelino linda,cookbook author printsandpropsse photographer ...,real,0,0,0
1,angelicablick,angelica blick,la angelicablick angelicablicklivese currently...,real,0,0,0
2,giulianafortuna,gf,giulianafortunaicloudcom,real,0,0,0
3,kerriehessillustration,illustrator hess kerrie,louis cartier vuitton printemps saab art artis...,real,0,0,0
4,renanozturk,ozturk renan,natgeo thenorthface photographer climber heart...,real,0,0,0


In [130]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_descriptions = df_username[df_username['is_fake'] == 'real']['nr_GPE']
fake_descriptions = df_username[df_username['is_fake'] == 'fake']['nr_GPE']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_descriptions, fake_descriptions, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat < 0:  # Adjusted for the mean of real dataset being lower
    print("Reject the null hypothesis: The mean of real dataset is lower than fake dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of real dataset is not lower than fake dataset.")


T-statistic: -0.6190207762600274
P-value: 0.26809261973394694
Fail to reject the null hypothesis: The mean of real dataset is not lower than fake dataset.


In [131]:
from scipy import stats
import pandas as pd

# Assuming df_description contains your dataset

# Separate data for real and fake descriptions
real_username = df_username[df_username['is_fake'] == 'real']['nr_GPE']
fake_username = df_username[df_username['is_fake'] == 'fake']['nr_GPE']

# Perform one-tailed two-sample t-test assuming unequal variances
t_stat, p_value = stats.ttest_ind(real_username, fake_username, equal_var=False)

# Display the results
print("T-statistic:", t_stat)
print("P-value:", p_value / 2)  # Division by 2 for a one-tailed test

alpha = 0.05  # Set your desired significance level
if (p_value / 2) < alpha and t_stat < 0:  # Reversed comparison logic for fake mean being higher
    print("Reject the null hypothesis: The mean of fake dataset is higher than real dataset.")
else:
    print("Fail to reject the null hypothesis: The mean of fake dataset is not higher than real dataset.")


T-statistic: -0.6190207762600274
P-value: 0.26809261973394694
Fail to reject the null hypothesis: The mean of fake dataset is not higher than real dataset.


In [132]:
nr_columns = [col for col in df_username.columns if col.startswith('nr_')]
sum_by_fake = df_username.groupby('is_fake')[nr_columns].sum().reset_index()
melted = pd.melt(sum_by_fake, id_vars='is_fake', var_name='Column', value_name='Total Value')
fig = px.bar(melted, x='Column', y='Total Value', color='is_fake', barmode='stack',color_discrete_sequence=px.colors.qualitative.Vivid,text = 'Total Value',
             title='Total number of entities per Account Type - Username', labels={'Column': 'Different entities', 'Total Value': 'Total Value'})
fig.update_layout(width=500, height=400)
fig.show()

In [54]:
fake_entities = get_entities_for_fake_accounts(dataframe, 'username', 'is_fake')

# Print sorted entities by label
for label, entities in sorted(fake_entities.items()):
    print(f"{label}: {entities}")

CARDINAL: ['nedabaghripv']
GPE: ['hamedosivand', 'hengamebaboli', 'kasbokarenternati', 'malikjan', 'mohammadberoo', 'naemefdaii', 'zahraabasi', 'zaiden']
ORG: ['akiop', 'aloneturkoglan', 'amirheydari', 'artcnc', 'hadipooransari', 'hesamamohammdi', 'maryamm', 'narmakma', 'nemat', 'raknrayfar', 'sheyniorg', 'tataloo', 'vidayari']
PERSON: ['javadhydari', 'llsanall', 'mahyar', 'marinr', 'mehrad', 'mogaddam', 'nonami', 'sana', 'sarajavadiiim', 'solmzzlf', 'zahraz', 'zari']
TIME: ['mhtbjoon']
