# Complete Data

In [18]:
# Loading in required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# This makes plots appear in the notebook
%matplotlib inline

## Data Cleaning  

In [19]:
df = pd.read_csv('thesis_combined_final.csv')

# dropping first 2 rows
df = df.drop(df.index[ :2])

# dropping questions 19 and 23 with many NAN
df = df.drop(['19', '23'], axis=1)

# re-labeling questions
df = df.rename(columns ={    
    '24':'Age',
    '25':'Position',
    '26':'Politics',
    '18':'Activities',
    '20': 'Arab_num',
    '21':'Lunch',
    '22':'Lunch_mates',
    '27':'Demo',
    '28': 'Jewish Pride',
    '29': 'Arab Pride'
})

# Re-labeling 'Politics' column
df.columns.values[-4] = 'Politics'

# Consolidating Demo column to Jewish or Arab
replacements = {'5,6': 'Jewish', 
                '5':'Jewish',
                '6': 'Jewish', 
                '1,2,4,7': 'Arab', 
                '1,3,4,7': 'Arab', 
                '1': 'Arab', 
                '2,7': 'Arab',
                '1,2,3,4,5,6,7': np.nan,
                '5,6': 'Jewish', 
                '5':'Jewish',
                '6': 'Jewish', 
                '1,2,4,7': 'Arab', 
                '1,3,4,7': 'Arab', 
                '1': 'Arab', 
                '2,7': 'Arab',
                '1,2,3,4,5,6,7': np.nan,
                '1,3,7': 'Arab', 
                '1,2,4': 'Arab', 
                'nan': 'NaN', 
                '3,7': 'Arab', 
                '1,3,4': 'Arab', 
                '1,5,7': 'Arab',
                '1,3,4,5,7': 'Arab', 
                '1,2,4,5,7': 'Arab',
                'ישראל':'Jewish',
                'ישראלי':'Jewish',
                'יהודי':'Jewish',
                'ישראלי,יהודי':'Jewish',
                'ערבי,מוסלמי,פלסטיני,ערבי ישראלי':'Arab',
                'ערבי,נוצרי,ערבי ישראלי':'Arab',
                'ערבי,מוסלמי,פלסטיני':'Arab',
                'ערבי,נוצרי,פלסטיני,ישראלי,ערבי ישראלי':'Arab',
                'נוצרי,ערבי ישראלי':'Arab',
                'ערבי,נוצרי,פלסטיני':'Arab',
                'ערבי,ישראלי,ערבי ישראלי':'Arab',
                'ערבי,מוסלמי,פלסטיני,ישראלי,ערבי ישראלי':'Arab',
                'ערבי':'Arab',
                'Israeli':'Jewish', 
                'Arab,Muslim,Palestinian':'Arab',
                'Arab':'Arab',
                'Muslim,Arab Israeli':'Arab',
                'Arab,Muslim,Christian,Palestinian,Israeli,Jewish,Arab Israeli':'Arab',
                'Arab,Muslim,Palestinian,Arab Israeli':'Arab',
                'Jewish':'Jewish',
                'Israeli,Jewish': 'Jewish'
}

df['Demo'] = df['Demo'].replace(replacements)

# Q22 How many Arab Israelis work at your current company? (change column to 1,2,3,4)

df['Arab_num'] = df['Arab_num'].replace('מעל 30', 'Over 30') # Changing hebrew string in Arab_num to a float

replacements2 = {'0-5':'1',
                 '5-10':'2',
                 '10-30':'3',
                 'Over 30':'4',
                 '05-Oct': np.nan,
                 'Oct-30': np.nan,
                }

df['Arab_num'] = df['Arab_num'].replace(replacements2)

### Changing columns to numeric values

In [20]:
# Columns with only numeric values
columns_numeric = ['2', '3', '4','5', '6','7', '8', '9', '10', 
                   '11', '12', '13', '14', '15', '16', '17', 
                   'Arab_num']
df[columns_numeric] = df[columns_numeric].astype(float)

# Column 1
df['1'] = df['1'].replace("Completely agree\n", "", regex=True)
df['1'] = df['1'].astype(float)
                   
# Column Politics
replacements3 = {'1\n':'1',
                 'מרכז\n4':'4',
                 'ימין מתון\n3':'3', 
                 'שמאל מתון\n4':'4', 
                 'ימין\n2':'2', 
                 'שמאל\n6':'6',
                 'שמאל קיצוני\n7':'7'
}

df['Politics'] = df['Politics'].replace(replacements3)
df['Politics'] = df['Politics'].astype(float)

# Age
df['Age'] = df['Age'].astype(float)

# Pride
df['Jewish Pride'] = df['Jewish Pride'].replace('מסכים/מה לחלוטין\n6', '6')
df['Jewish Pride'] = df['Jewish Pride'].astype(float)

df['Arab Pride'] = df['Arab Pride'].replace('מסכים/מה לחלוטין\n6', '6')
df['Arab Pride'] = df['Arab Pride'].replace('לא מסכים/מה בכלל\n1', '1')
df['Arab Pride'] = df['Arab Pride'].astype(float)

# Checking work
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 2 to 269
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   UserLanguage  268 non-null    object 
 1   1             202 non-null    float64
 2   2             202 non-null    float64
 3   3             198 non-null    float64
 4   4             202 non-null    float64
 5   5             202 non-null    float64
 6   6             202 non-null    float64
 7   7             202 non-null    float64
 8   8             202 non-null    float64
 9   9             202 non-null    float64
 10  10            202 non-null    float64
 11  11            202 non-null    float64
 12  12            202 non-null    float64
 13  13            202 non-null    float64
 14  14            202 non-null    float64
 15  15            200 non-null    float64
 16  16            202 non-null    float64
 17  17            202 non-null    float64
 18  Activities    202 non-null    

In [21]:
# inversing points order of questions ['8', '10', '16']
replacement_dict = {
    6: 1,
    5: 2,
    4: 3,
    1: 6,
    2: 5,
    3: 4
}

df['8'] = df['8'].replace(replacement_dict)
df['10'] = df['10'].replace(replacement_dict)
df['16'] = df['16'].replace(replacement_dict)

In [23]:
import numpy as np

# Grouping political data
def categorize_politics(value):
    if value in [1.0, 2.0]:
        return 'right'
    elif value in [6.0, 7.0]:
        return 'left'
    elif value in [3.0, 4.0, 5.0]:
        return 'center'
    else:
        return np.nan  # Use np.nan to represent NaN values

# Apply the categorization function to create the 'affiliation' column
df['affiliation'] = df['Politics'].apply(categorize_politics)

# Count the values in the 'affiliation' column
affiliation_counts = df['affiliation'].value_counts()
print(affiliation_counts)

center    107
right      54
left       38
Name: affiliation, dtype: int64


In [24]:
# Manipulating data for only Jewish
df = df[df.Demo == "Jewish"]

## Data By Political Affiliation

In [28]:
# Assume you have already added the 'affiliation' column to your DataFrame 'df'

from scipy.stats import ttest_ind

results = []

for column in range(1, 18):
    right_mean = df[df['affiliation'] == 'right'][str(column)].mean()
    left_mean = df[df['affiliation'] == 'left'][str(column)].mean()
    center_mean = df[df['affiliation'] == 'center'][str(column)].mean()

    t_statistic_right_left, p_value_right_left = ttest_ind(df[df['affiliation'] == 'right'][str(column)],
                                                           df[df['affiliation'] == 'left'][str(column)],
                                                           nan_policy='omit')

    t_statistic_right_center, p_value_right_center = ttest_ind(df[df['affiliation'] == 'right'][str(column)],
                                                               df[df['affiliation'] == 'center'][str(column)],
                                                               nan_policy='omit')

    t_statistic_left_center, p_value_left_center = ttest_ind(df[df['affiliation'] == 'left'][str(column)],
                                                             df[df['affiliation'] == 'center'][str(column)],
                                                             nan_policy='omit')

    results.append({
        'Column': str(column),
        'Right Mean': right_mean,
        'Left Mean': left_mean,
        'Center Mean': center_mean,
        'Right vs Left p-value': p_value_right_left,
        'Right vs Center p-value': p_value_right_center,
        'Left vs Center p-value': p_value_left_center,
        'Right vs Left Significant': p_value_right_left < 0.05,
        'Right vs Center Significant': p_value_right_center < 0.05,
        'Left vs Center Significant': p_value_left_center < 0.05
    })

results_df = pd.DataFrame(results)
results_df = results_df[['Column', 'Right Mean', 'Left Mean', 'Center Mean',
                         'Right vs Left p-value', 'Right vs Left Significant',
                         'Right vs Center p-value', 'Right vs Center Significant',
                         'Left vs Center p-value', 'Left vs Center Significant']]

results_df

Unnamed: 0,Column,Right Mean,Left Mean,Center Mean,Right vs Left p-value,Right vs Left Significant,Right vs Center p-value,Right vs Center Significant,Left vs Center p-value,Left vs Center Significant
0,1,3.851852,5.230769,3.933333,0.0004731098,True,0.7436703,False,1.987484e-07,True
1,2,4.185185,5.307692,4.288889,0.003146212,True,0.6784503,False,0.0001065667,True
2,3,3.269231,4.923077,3.8,2.023067e-06,True,0.01371758,True,2.13806e-06,True
3,4,4.777778,5.153846,4.733333,0.1436415,False,0.8203741,False,0.1012632,False
4,5,4.925926,4.923077,4.4,0.9916808,False,0.00978461,True,0.03844313,True
5,6,4.518519,4.769231,3.755556,0.4512477,False,0.002326433,True,0.001431117,True
6,7,5.740741,4.692308,4.955556,7.450425e-07,True,2.391711e-06,True,0.2972855,False
7,8,3.333333,4.538462,4.244444,0.001808316,True,0.0001285968,True,0.2996039,False
8,9,3.0,4.615385,4.2,2.467226e-06,True,2.539447e-09,True,0.1141482,False
9,10,2.074074,2.846154,2.911111,0.00619917,True,2.415538e-06,True,0.7857311,False


In [44]:
from scipy.stats import ttest_ind
import pandas as pd
import numpy as np

# Define your groups
groups = {  
    'attitude_towards_teamwork': ['1'],
    'work_motivation': ['2'],
    'connection_to_workplace': ['4'],
    'work_relationships': ['5', '6', '17'],  # throwing out question q25 (21), 'Lunch', and 'Lunch_mates'
    'Self_identity': ['7', 'Jewish Pride'], 
    'attitude towards outgroup': ['8', '9', '10', '11', '12', '13', '15']
}

# Initialize a list to store results
results = []

# Define the affiliations
affiliations = ['right', 'left', 'center']

# Create a Jupyter Notebook-friendly output using Markdown
output = []

# Loop through the groups
for group_name, columns in groups.items():
    # Loop through affiliations
    for i, affiliation in enumerate(affiliations):
        for j in range(i + 1, len(affiliations)):
            # Extract the subgroup columns for the specific affiliations
            subgroup1 = df[df['affiliation'] == affiliations[i]][columns]
            subgroup2 = df[df['affiliation'] == affiliations[j]][columns]

            # Convert the subgroup columns to numeric and handle non-numeric values or NaNs
            subgroup1 = subgroup1.apply(pd.to_numeric, errors='coerce')
            subgroup2 = subgroup2.apply(pd.to_numeric, errors='coerce')

            # Check if there are NaN values after conversion
            if subgroup1.isna().any().any() or subgroup2.isna().any().any():
                # Handle cases where the subgroups contain NaN values
                continue

            # Initialize lists to store p-values and significances for each column
            p_values = []
            significances = []

            # Perform t-tests for each column
            for col in subgroup1.columns:
                t_statistic, p_value = ttest_ind(subgroup1[col], subgroup2[col], nan_policy='omit')
                p_values.append(p_value)
                if p_value < 0.05:
                    significances.append('Significant')
                else:
                    significances.append('Not Significant')

            # Create a Markdown table for the comparison
            table = pd.DataFrame({
                'Metric': columns,
                f'{affiliations[i]} Mean': subgroup1.mean().values,
                f'{affiliations[j]} Mean': subgroup2.mean().values,
                'p-values': p_values,
                'Significances': significances
            }).to_markdown(index=False)

            # Store the Markdown table in the output list
            output.append(f"## {group_name} - {affiliations[i]} vs {affiliations[j]}\n\n{table}\n")

# Print the Markdown-formatted output
from IPython.display import Markdown
Markdown("\n".join(output))


## attitude_towards_teamwork - right vs left

|   Metric |   right Mean |   left Mean |   p-values | Significances   |
|---------:|-------------:|------------:|-----------:|:----------------|
|        1 |      3.85185 |     5.23077 | 0.00047311 | Significant     |

## attitude_towards_teamwork - right vs center

|   Metric |   right Mean |   center Mean |   p-values | Significances   |
|---------:|-------------:|--------------:|-----------:|:----------------|
|        1 |      3.85185 |       3.93333 |    0.74367 | Not Significant |

## attitude_towards_teamwork - left vs center

|   Metric |   left Mean |   center Mean |    p-values | Significances   |
|---------:|------------:|--------------:|------------:|:----------------|
|        1 |     5.23077 |       3.93333 | 1.98748e-07 | Significant     |

## work_motivation - right vs left

|   Metric |   right Mean |   left Mean |   p-values | Significances   |
|---------:|-------------:|------------:|-----------:|:----------------|
|        2 |      4.18519 |     5.30769 | 0.00314621 | Significant     |

## work_motivation - right vs center

|   Metric |   right Mean |   center Mean |   p-values | Significances   |
|---------:|-------------:|--------------:|-----------:|:----------------|
|        2 |      4.18519 |       4.28889 |    0.67845 | Not Significant |

## work_motivation - left vs center

|   Metric |   left Mean |   center Mean |    p-values | Significances   |
|---------:|------------:|--------------:|------------:|:----------------|
|        2 |     5.30769 |       4.28889 | 0.000106567 | Significant     |

## connection_to_workplace - right vs left

|   Metric |   right Mean |   left Mean |   p-values | Significances   |
|---------:|-------------:|------------:|-----------:|:----------------|
|        4 |      4.77778 |     5.15385 |   0.143641 | Not Significant |

## connection_to_workplace - right vs center

|   Metric |   right Mean |   center Mean |   p-values | Significances   |
|---------:|-------------:|--------------:|-----------:|:----------------|
|        4 |      4.77778 |       4.73333 |   0.820374 | Not Significant |

## connection_to_workplace - left vs center

|   Metric |   left Mean |   center Mean |   p-values | Significances   |
|---------:|------------:|--------------:|-----------:|:----------------|
|        4 |     5.15385 |       4.73333 |   0.101263 | Not Significant |

## work_relationships - right vs left

|   Metric |   right Mean |   left Mean |   p-values | Significances   |
|---------:|-------------:|------------:|-----------:|:----------------|
|        5 |      4.92593 |     4.92308 |   0.991681 | Not Significant |
|        6 |      4.51852 |     4.76923 |   0.451248 | Not Significant |
|       17 |      4.77778 |     5.07692 |   0.297108 | Not Significant |

## work_relationships - right vs center

|   Metric |   right Mean |   center Mean |   p-values | Significances   |
|---------:|-------------:|--------------:|-----------:|:----------------|
|        5 |      4.92593 |       4.4     | 0.00978461 | Significant     |
|        6 |      4.51852 |       3.75556 | 0.00232643 | Significant     |
|       17 |      4.77778 |       4.15556 | 0.00472306 | Significant     |

## work_relationships - left vs center

|   Metric |   left Mean |   center Mean |    p-values | Significances   |
|---------:|------------:|--------------:|------------:|:----------------|
|        5 |     4.92308 |       4.4     | 0.0384431   | Significant     |
|        6 |     4.76923 |       3.75556 | 0.00143112  | Significant     |
|       17 |     5.07692 |       4.15556 | 0.000453856 | Significant     |

## Self_identity - right vs center

| Metric       |   right Mean |   center Mean |    p-values | Significances   |
|:-------------|-------------:|--------------:|------------:|:----------------|
| 7            |      5.74074 |       4.95556 | 2.39171e-06 | Significant     |
| Jewish Pride |      5.81481 |       5       | 5.84491e-07 | Significant     |

## attitude towards outgroup - right vs left

|   Metric |   right Mean |   left Mean |    p-values | Significances   |
|---------:|-------------:|------------:|------------:|:----------------|
|        8 |      3.33333 |     4.53846 | 0.00180832  | Significant     |
|        9 |      3       |     4.61538 | 2.46723e-06 | Significant     |
|       10 |      2.07407 |     2.84615 | 0.00619917  | Significant     |
|       11 |      3.22222 |     4.84615 | 2.41754e-07 | Significant     |
|       12 |      3.07407 |     4.69231 | 4.12054e-07 | Significant     |
|       13 |      1.7037  |     4.46154 | 1.16935e-16 | Significant     |
|       15 |      4.22222 |     5.15385 | 0.020602    | Significant     |
