# Complete Data

In [1]:
# Loading in required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings raised
# by the libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# This makes plots appear in the notebook
%matplotlib inline

## Data Cleaning  

In [2]:
# Load the combined survey export.
df = pd.read_csv('thesis_combined_final.csv')

# dropping first 2 rows
# NOTE(review): presumably extra header/metadata rows from the survey export - confirm.
df = df.drop(df.index[:2])

# dropping questions 19 and 23 with many NAN
df = df.drop(['19', '23'], axis=1)

# re-labeling questions
df = df.rename(columns={
    '24': 'Age',
    '25': 'Position',
    '26': 'Politics',
    '18': 'Activities',
    '20': 'Arab_num',
    '21': 'Lunch',
    '22': 'Lunch_mates',
    '27': 'Demo',
    '28': 'Jewish Pride',
    '29': 'Arab Pride'
})

# Re-labeling 'Politics' column by position (4th from the end).
# A fresh list of labels is assigned instead of writing into `df.columns.values`:
# an Index is meant to be immutable, and mutating its backing array in place can
# leave the Index's cached label lookup out of sync with the visible labels.
column_labels = df.columns.tolist()
column_labels[-4] = 'Politics'
df.columns = column_labels

# Consolidating Demo column to Jewish or Arab.
# Each key is one raw answer as it appears in the Demo column (numeric option
# codes, Hebrew labels or English labels); each value is the consolidated group.
# Every key appears exactly once (a duplicated key would silently be overridden
# by its last occurrence).
replacements = {
    # Numeric option codes
    '5,6': 'Jewish',
    '5': 'Jewish',
    '6': 'Jewish',
    '1,2,4,7': 'Arab',
    '1,3,4,7': 'Arab',
    '1': 'Arab',
    '2,7': 'Arab',
    # NOTE(review): the all-options answer is discarded here, but the English
    # all-options answer further down is mapped to 'Arab' - if both encode the
    # same response they are treated differently; confirm which is intended.
    '1,2,3,4,5,6,7': np.nan,
    '1,3,7': 'Arab',
    '1,2,4': 'Arab',
    # NOTE(review): this maps the literal string 'nan' to the literal string
    # 'NaN', not to a missing value - confirm whether np.nan was intended.
    'nan': 'NaN',
    '3,7': 'Arab',
    '1,3,4': 'Arab',
    '1,5,7': 'Arab',
    '1,3,4,5,7': 'Arab',
    '1,2,4,5,7': 'Arab',
    # Hebrew labels
    'ישראל':'Jewish',
    'ישראלי':'Jewish',
    'יהודי':'Jewish',
    'ישראלי,יהודי':'Jewish',
    'ערבי,מוסלמי,פלסטיני,ערבי ישראלי':'Arab',
    'ערבי,נוצרי,ערבי ישראלי':'Arab',
    'ערבי,מוסלמי,פלסטיני':'Arab',
    'ערבי,נוצרי,פלסטיני,ישראלי,ערבי ישראלי':'Arab',
    'נוצרי,ערבי ישראלי':'Arab',
    'ערבי,נוצרי,פלסטיני':'Arab',
    'ערבי,ישראלי,ערבי ישראלי':'Arab',
    'ערבי,מוסלמי,פלסטיני,ישראלי,ערבי ישראלי':'Arab',
    'ערבי':'Arab',
    # English labels
    'Israeli': 'Jewish',
    'Arab,Muslim,Palestinian': 'Arab',
    'Arab': 'Arab',
    'Muslim,Arab Israeli': 'Arab',
    'Arab,Muslim,Christian,Palestinian,Israeli,Jewish,Arab Israeli': 'Arab',
    'Arab,Muslim,Palestinian,Arab Israeli': 'Arab',
    'Jewish': 'Jewish',
    'Israeli,Jewish': 'Jewish'
}

# Answers not listed in the mapping are left unchanged by replace().
df['Demo'] = df['Demo'].replace(replacements)

# Q22 How many Arab Israelis work at your current company? (change column to 1,2,3,4)

# Step 1: translate the Hebrew answer label into its English equivalent so that
# a single mapping can handle both survey languages.
df['Arab_num'] = df['Arab_num'].replace({'מעל 30': 'Over 30'})

# Step 2: recode the answer ranges as ordinal codes '1'..'4' (still strings here).
# NOTE(review): '05-Oct' and 'Oct-30' look like the ranges '5-10' and '10-30'
# after spreadsheet date auto-formatting; they are currently discarded as
# missing - confirm whether they should map to '2' and '3' instead.
replacements2 = {
    '0-5': '1',
    '5-10': '2',
    '10-30': '3',
    'Over 30': '4',
    '05-Oct': np.nan,
    'Oct-30': np.nan,
}

df['Arab_num'] = df['Arab_num'].replace(to_replace=replacements2)

### Changing columns to numeric values

In [3]:
# Questions 2-17 and Arab_num are cast to float directly.
columns_numeric = [str(question) for question in range(2, 18)] + ['Arab_num']
df[columns_numeric] = df[columns_numeric].astype(float)

# Question 1: remove the "Completely agree\n" label text, then cast.
df['1'] = df['1'].replace("Completely agree\n", "", regex=True).astype(float)

# Politics: reduce each labelled answer to its numeric code, then cast.
# NOTE(review): both 'מרכז' (center) and 'שמאל מתון' (moderate left) map to '4',
# and no key maps to '5' - confirm against the raw export that this is intended.
replacements3 = {
    '1\n': '1',
    'מרכז\n4': '4',
    'ימין מתון\n3': '3',
    'שמאל מתון\n4': '4',
    'ימין\n2': '2',
    'שמאל\n6': '6',
    'שמאל קיצוני\n7': '7'
}
df['Politics'] = df['Politics'].replace(replacements3).astype(float)

# Age
df['Age'] = df['Age'].astype(float)

# Pride: strip the Hebrew labels from the labelled answers, then cast.
df['Jewish Pride'] = (
    df['Jewish Pride']
    .replace('מסכים/מה לחלוטין\n6', '6')
    .astype(float)
)

df['Arab Pride'] = (
    df['Arab Pride']
    .replace('מסכים/מה לחלוטין\n6', '6')
    .replace('לא מסכים/מה בכלל\n1', '1')
    .astype(float)
)

# Checking work: every converted column should now report a float dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 2 to 269
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   UserLanguage  268 non-null    object 
 1   1             202 non-null    float64
 2   2             202 non-null    float64
 3   3             198 non-null    float64
 4   4             202 non-null    float64
 5   5             202 non-null    float64
 6   6             202 non-null    float64
 7   7             202 non-null    float64
 8   8             202 non-null    float64
 9   9             202 non-null    float64
 10  10            202 non-null    float64
 11  11            202 non-null    float64
 12  12            202 non-null    float64
 13  13            202 non-null    float64
 14  14            202 non-null    float64
 15  15            200 non-null    float64
 16  16            202 non-null    float64
 17  17            202 non-null    float64
 18  Activities    202 non-null    

In [4]:
# Reverse the scoring of questions 8, 10 and 16 so that 1 <-> 6, 2 <-> 5, 3 <-> 4.
# The mapping is its own inverse and is applied to df in place, so running this
# cell a second time restores the original scores.
replacement_dict = {6: 1, 5: 2, 4: 3, 1: 6, 2: 5, 3: 4}

for reversed_question in ('8', '10', '16'):
    df[reversed_question] = df[reversed_question].replace(replacement_dict)

In [13]:
# grouping political data
def categorize_politics(value):
    """Bucket a numeric Politics score into a political affiliation label.

    Parameters
    ----------
    value : float
        Score from the 'Politics' column.

    Returns
    -------
    str or float
        'right' for 1-3, 'center' for 4, 'left' for 5-7, and ``np.nan`` for
        anything else (including a missing score).
    """
    if value in [1.0, 2.0, 3.0]:
        return 'right'
    elif value in [5.0, 6.0, 7.0]:
        return 'left'
    elif value in [4.0]:
        return 'center'
    else:
        # `np.nan` is the NumPy missing-value constant; `np.na` does not exist
        # and raised AttributeError for every unmatched score.
        return np.nan

# Label every respondent with an affiliation, then show the size of each group.
df['affiliation'] = df['Politics'].map(categorize_politics)
df['affiliation'].value_counts()

right     76
center    51
left      43
Name: affiliation, dtype: int64

In [14]:
# Keep only the rows whose consolidated Demo value is "Jewish"; df is
# overwritten, so every later cell works on this subset.
is_jewish = df['Demo'] == "Jewish"
df = df[is_jewish]

## Data By Political Affiliation

In [16]:
from scipy.stats import ttest_ind

# Split the respondents by affiliation once instead of re-filtering for every test.
affiliation_subsets = {
    name: df[df['affiliation'] == name] for name in ('right', 'left', 'center')
}
affiliation_pairs = [('Right', 'Left'), ('Right', 'Center'), ('Left', 'Center')]

# One row per question 1-17: the three group means plus an independent two-sample
# t-test (NaN answers omitted) for each pair of affiliations, flagged at p < 0.05.
results = []
for question in map(str, range(1, 18)):
    row = {'Column': question}

    for label in ('Right', 'Left', 'Center'):
        row[f'{label} Mean'] = affiliation_subsets[label.lower()][question].mean()

    for first, second in affiliation_pairs:
        t_statistic, p_value = ttest_ind(affiliation_subsets[first.lower()][question],
                                         affiliation_subsets[second.lower()][question],
                                         nan_policy='omit')
        row[f'{first} vs {second} p-value'] = p_value
        row[f'{first} vs {second} Significant'] = p_value < 0.05

    results.append(row)

# Fix the display order of the columns.
column_order = ['Column', 'Right Mean', 'Left Mean', 'Center Mean',
                'Right vs Left p-value', 'Right vs Left Significant',
                'Right vs Center p-value', 'Right vs Center Significant',
                'Left vs Center p-value', 'Left vs Center Significant']
results_df = pd.DataFrame(results)[column_order]

results_df

Unnamed: 0,Column,Right Mean,Left Mean,Center Mean,Right vs Left p-value,Right vs Left Significant,Right vs Center p-value,Right vs Center Significant,Left vs Center p-value,Left vs Center Significant
0,1,3.763158,4.72093,4.098039,0.001224947,True,0.208976,False,0.006935891,True
1,2,4.184211,4.883721,4.352941,0.01774454,True,0.5278937,False,0.0223903,True
2,3,3.432432,4.604651,3.686275,8.791798e-06,True,0.2653718,False,1.957638e-05,True
3,4,4.736842,5.046512,4.72549,0.1415274,False,0.956006,False,0.178005,False
4,5,4.736842,4.604651,4.54902,0.5786363,False,0.3772107,False,0.8037083,False
5,6,4.315789,4.348837,3.745098,0.9066682,False,0.02700298,True,0.04950155,True
6,7,5.631579,4.767442,4.803922,3.094183e-07,True,3.163223e-06,True,0.8795615,False
7,8,3.473684,4.511628,4.352941,0.0003548334,True,0.000523634,True,0.5335599,False
8,9,3.184211,4.581395,4.333333,3.531715e-08,True,3.200718e-08,True,0.3230543,False
9,10,2.315789,2.860465,2.921569,0.01733104,True,0.001834575,True,0.7695855,False


In [17]:
from scipy.stats import ttest_ind

# Thematic groups of questions.
groups = {
    'Work_Attitude_and_Motivation': ['1', '2', '3', '4'],
    'Work_Relationships': ['5', '6'],
    'Self_Identity_and_Perception': ['7', '9'],
    'Inter_group_Relationships': ['11', '12', '13', '14'],
    'Equality_and_Fairness': ['15', '17'],
    'Inter_Group_Anxiety': ['8', '10', '16']
}

# Split the respondents by affiliation once instead of re-filtering for every test.
subsets_by_affiliation = {
    name: df[df['affiliation'] == name] for name in ('right', 'left', 'center')
}
pairs_to_compare = [('Right', 'Left'), ('Right', 'Center'), ('Left', 'Center')]

# One row per group of questions. The reported mean is the mean of the
# per-question means; the t-tests run on all answers of the group's questions
# stacked into a single series (NaN answers omitted), flagged at p < 0.05.
# NOTE(review): stacking makes each respondent contribute one value per question
# to the same test, so the observations are not independent - confirm this is
# intended rather than testing one composite score per respondent.
results = []
for group, columns in groups.items():
    row = {'Group': group}

    for label in ('Right', 'Left', 'Center'):
        row[f'{label} Mean'] = subsets_by_affiliation[label.lower()][columns].mean().mean()

    for first, second in pairs_to_compare:
        t_statistic, p_value = ttest_ind(subsets_by_affiliation[first.lower()][columns].stack(),
                                         subsets_by_affiliation[second.lower()][columns].stack(),
                                         nan_policy='omit')
        row[f'{first} vs {second} p-value'] = p_value
        row[f'{first} vs {second} Significant'] = p_value < 0.05

    results.append(row)

# Fix the display order of the columns.
group_column_order = ['Group', 'Right Mean', 'Left Mean', 'Center Mean',
                      'Right vs Left p-value', 'Right vs Left Significant',
                      'Right vs Center p-value', 'Right vs Center Significant',
                      'Left vs Center p-value', 'Left vs Center Significant']
results_df = pd.DataFrame(results)[group_column_order]

results_df

Unnamed: 0,Group,Right Mean,Left Mean,Center Mean,Right vs Left p-value,Right vs Left Significant,Right vs Center p-value,Right vs Center Significant,Left vs Center p-value,Left vs Center Significant
0,Work_Attitude_and_Motivation,4.029161,4.813953,4.215686,1.654716e-08,True,0.1516722,False,3.534719e-07,True
1,Work_Relationships,4.526316,4.476744,4.147059,0.7889029,False,0.02625577,True,0.0887616,False
2,Self_Identity_and_Perception,4.407895,4.674419,4.568627,0.1715603,False,0.3675732,False,0.5448085,False
3,Inter_group_Relationships,3.407895,4.546512,3.568627,1.590057e-15,True,0.2330472,False,6.26582e-14,True
4,Equality_and_Fairness,4.539474,4.872093,4.378752,0.08500128,False,0.3673633,False,0.008481215,True
5,Inter_Group_Anxiety,2.754386,3.767442,3.620915,1.513241e-09,True,1.038709e-08,True,0.3863862,False


In [8]:
# 1) Compare means based on political views
# 2) Correlation between political view and each question.
# 3) Reverse the inversely scored answers such that 1 = 6, 2 = 5...
# 4) Try English and Hebrew again