In [51]:
import pandas as pd
import numpy as np

df = pd.read_csv('Information.csv')

#visualizing the data
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  pr

Unnamed: 0,_unit_id,_trusted_judgments,gender:confidence,profile_yn:confidence,fav_number,retweet_count,tweet_count,tweet_id
count,20050.0,20050.0,20024.0,20050.0,20050.0,20050.0,20050.0,20050.0
mean,815729400.0,3.615711,0.882756,0.993221,4382.201646,0.079401,38924.69,6.58735e+17
std,6000.801,12.33189,0.191403,0.047168,12518.575919,2.649751,116837.1,5000124000000.0
min,815719200.0,3.0,0.0,0.6272,0.0,0.0,1.0,6.5873e+17
25%,815724300.0,3.0,0.6778,1.0,11.0,0.0,2398.0,6.5873e+17
50%,815729400.0,3.0,1.0,1.0,456.0,0.0,11441.5,6.5873e+17
75%,815734500.0,3.0,1.0,1.0,3315.5,0.0,40027.5,6.5874e+17
max,815758000.0,274.0,1.0,1.0,341621.0,330.0,2680199.0,6.5874e+17


In [52]:
#keeping rows whose 'gender:confidence' value is >0.99
df = df[df['gender:confidence'] > 0.99]

#keeping only male and female genders
df = df[df['gender'].isin(['male', 'female'])]

#dropping all rows that either contain similar values or are unnecessary
df = df.drop(['_unit_id','_golden','_unit_state','_trusted_judgments','profile_yn','profile_yn:confidence','gender_gold','profile_yn_gold','profileimage','retweet_count','tweet_created','tweet_coord','tweet_id','tweet_location'],axis =1)

#dropping all null values
df_filtered = df.dropna(how='any',axis=0)

#visualizing the cleaned data
df_filtered.info()
df_filtered.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6108 entries, 0 to 19995
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _last_judgment_at  6108 non-null   object 
 1   gender             6108 non-null   object 
 2   gender:confidence  6108 non-null   float64
 3   created            6108 non-null   object 
 4   description        6108 non-null   object 
 5   fav_number         6108 non-null   int64  
 6   link_color         6108 non-null   object 
 7   name               6108 non-null   object 
 8   sidebar_color      6108 non-null   object 
 9   text               6108 non-null   object 
 10  tweet_count        6108 non-null   int64  
 11  user_timezone      6108 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 620.3+ KB


Unnamed: 0,gender:confidence,fav_number,tweet_count
count,6108.0,6108.0,6108.0
mean,1.0,6466.656516,34942.61
std,0.0,14308.402548,78057.6
min,1.0,0.0,1.0
25%,1.0,294.0,4546.0
50%,1.0,1674.5,14065.0
75%,1.0,6485.0,38001.0
max,1.0,341621.0,2680199.0


In [53]:
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer

#these functions will check each word for spelling errors
spell = SpellChecker()
tokenizer = RegexpTokenizer(r'\w+')

#function for removing urls, hashtags and mentions
def remove_mention_url(text):
    clean = []
    words = text.split()
    for word in words:
        if not word.startswith('@') and not word.startswith('http') and not word.startswith('www') and not word.startswith('#'):
            clean.append(word)
    return ' '.join(clean)


#creating a new column 'typos' which stores the number of typos in each tweet
df_filtered['typos'] = df_filtered['text'].apply(lambda x: len(spell.unknown(tokenizer.tokenize(remove_mention_url(x)))))

df_filtered['typos']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['typos'] = df_filtered['text'].apply(lambda x: len(spell.unknown(tokenizer.tokenize(remove_mention_url(x)))))


0        0
1        2
3        1
5        0
9        0
        ..
19989    0
19990    1
19993    0
19994    0
19995    0
Name: typos, Length: 6108, dtype: int64

In [54]:
#filtering out all the typos written by males
df_male = df_filtered[df_filtered['gender'] == 'male']
df_male_typos = pd.DataFrame(df_male,columns=['typos'])

#calculating average number of typos in tweets by males
av_male_typos = df_male_typos.mean(axis=0)
print (av_male_typos)

#filtering out all the typos written by females
df_female = df_filtered[df_filtered['gender'] == 'female']
df_female_typos = pd.DataFrame(df_female,columns=['typos'])

#calculating average number of typos in tweets by females
av_female_typos = df_female_typos.mean(axis=0)
print (av_female_typos)

typos    0.829219
dtype: float64
typos    0.928594
dtype: float64


In [55]:
#finding out the most common sidebar colour for male entries
df_male = df_filtered[df_filtered['gender'] == 'male']
df_male['sidebar_color'].value_counts()

C0DEED    871
FFFFFF    727
0         644
EEEEEE    179
181A1E    115
         ... 
F50A29      1
44EBA8      1
892DD8      1
161C1A      1
93BB4B      1
Name: sidebar_color, Length: 140, dtype: int64

In [56]:
#finding out the most common sidebar colour for female entries
df_female = df_filtered[df_filtered['gender'] == 'female']
df_female['sidebar_color'].value_counts()

FFFFFF    989
0         805
C0DEED    545
EEEEEE    138
65B0DA     79
         ... 
8F1344      1
F7F2F4      1
F0E661      1
FF00E1      1
070A09      1
Name: sidebar_color, Length: 215, dtype: int64

In [57]:
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer

spell = SpellChecker()
tokenizer = RegexpTokenizer(r'\w+')

# function that will remove URLs and keep only words, mentions and hashtags

def remove_url(text):
    clean = []
    if text == None:
        return
    words = text.split()
    for word in words:
        if not word.startswith('http') and not word.startswith('www'):
            clean.append(word)
    return ' '.join(clean)


#function that will clean the 'text' and 'description' columns and concatenate them into a single column

def data_prep_for_nb(df):
    df.dropna(subset=['text', 'description'], inplace=True)
    df['text_normalized'] = df['text'].apply(lambda x: ' '.join(tokenizer.tokenize(remove_url(x))))
    df['description'] = df['description'].astype(str)
    df['description_normalized'] = df['description'].apply(lambda x: ' '.join(tokenizer.tokenize(remove_url(x))))
    df['all_text_features'] = df['text_normalized'].str.cat(df['description_normalized'], sep=' ')
    df['all_text_features']

data_prep_for_nb(df_filtered)    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['text', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_normalized'] = df['text'].apply(lambda x: ' '.join(tokenizer.tokenize(remove_url(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].astype(str)
A value is trying to be set on a copy of a slice from a Data

In [58]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')

#transforming 'all_text_features' into a sparse matrix 
x = vectorizer.fit_transform(df_filtered['all_text_features'])

#running LabelEncoder function on 'gender'
y = encoder.fit_transform(df_filtered['gender'])

#splitting the data

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [59]:
from sklearn.naive_bayes import MultinomialNB

#running Naive-Bayes algorithm with the given data
nb = MultinomialNB()
nb.fit(x_train, y_train)

#printing the accuracy of the model
print(nb.score(x_test, y_test))

0.679109364767518


In [60]:
link_color_encoder = LabelEncoder()
sidebar_color_encoder = LabelEncoder()

#running LabelEncoder function on 'link_color' and 'sidebar_color' columns
df_filtered['link_color_e'] = link_color_encoder.fit_transform(df_filtered['link_color'])
df_filtered['sidebar_color_e'] = sidebar_color_encoder.fit_transform(df_filtered['sidebar_color'])

timezone_encoder = LabelEncoder()

#forcibly changing all entries in 'user_timezone' column to string
df_filtered['user_timezone'] = df_filtered['user_timezone'].astype(str)

#running LabelEncoder function on 'user_timezone' column
df_filtered['user_timezone_e'] = timezone_encoder.fit_transform(df_filtered['user_timezone'])

from sklearn.model_selection import train_test_split

#passing independent variables through X
X = df_filtered[['typos', 'tweet_count', 'link_color_e', 'sidebar_color_e', 'user_timezone_e']].copy()

#passing dependent variable, i.e., gender, through Y
Y = df_filtered[['gender']].copy()

#splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['link_color_e'] = link_color_encoder.fit_transform(df_filtered['link_color'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sidebar_color_e'] = sidebar_color_encoder.fit_transform(df_filtered['sidebar_color'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['use

In [61]:
from sklearn.linear_model import LogisticRegression
LogReg = LogisticRegression()

# training Linear Regression model on training data
LogReg.fit(X_train, Y_train)

#calculating accuracy of the model in the form of log loss
y_pred = LogReg.predict(X_test)
ypred_prob = LogReg.predict_proba(X_test)

from sklearn.metrics import log_loss
log_loss(Y_test, ypred_prob)

  return f(**kwargs)


0.6779802974374961

In [62]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# training Random Forest Classifier model on training data
rfc.fit(X_train, Y_train)

# calculating accuracy for the model
from sklearn import metrics

y_pred = rfc.predict(X_test)
print("Test set Accuracy: ", metrics.accuracy_score(Y_test, y_pred))

  rfc.fit(X_train, Y_train)


Test set Accuracy:  0.610347085789129


In [64]:
# This is the input that's used in MultinomialNB prediction, LogisticRegression prediction and RandomForestClassifier prediction 
test_row = {'text': ['This is greaaat! @apple http://apple.com'], 'description': ['ML is wonderful'], 'tweet_count': [400], 'sidebar_color': ['C0DEED'], 'link_color': ['0084B4'], 'user_timezone': ['Pacific Time (US & Canada)'] }
test_df = pd.DataFrame(data=test_row)
test_df

Unnamed: 0,text,description,tweet_count,sidebar_color,link_color,user_timezone
0,This is greaaat! @apple http://apple.com,ML is wonderful,400,C0DEED,0084B4,Pacific Time (US & Canada)


In [65]:
encoder = LabelEncoder()

# predicting the gender using MultinomialNB
data_prep_for_nb(test_df)
real_test_x = vectorizer.transform(test_df['all_text_features'])
real_test_x
y_predicted = nb.predict(real_test_x)
encoder.inverse_transform(y_predicted)

array(['male'], dtype=object)

In [66]:
link_color_encoder = LabelEncoder()
sidebar_color_encoder = LabelEncoder()
timezone_encoder = LabelEncoder()

#running LabelEncoder the link_color and sidebar_color columns of test_df 
test_df['link_color_e'] = link_color_encoder.transform(test_df['link_color'])
test_df['sidebar_color_e'] = sidebar_color_encoder.transform(test_df['sidebar_color'])

#converting all entries of user_timezone column of test_df to string and then running LabelEncoder function on it 
test_df['user_timezone'] = test_df['user_timezone'].astype(str)
test_df['user_timezone_e'] = timezone_encoder.transform(test_df['user_timezone'])

#calculating the number of typos in the text column of test_df
test_df['typos'] = test_df['text'].apply(lambda x: len(spell.unknown(tokenizer.tokenize(remove_mention_url(x)))))

In [67]:
#predicting the data using LogisticRegression
df_with_feateng = test_df[['typos', 'tweet_count', 'link_color_e', 'sidebar_color_e', 'user_timezone_e']].copy()
LogReg.predict(df_with_feateng)

array(['male'], dtype=object)

In [68]:
#predicting the data using random forest classifier
rfc.predict(df_with_feateng)

array(['female'], dtype=object)